# Define a cross-validation strategy.
# We use the cross_val_score function from scikit-learn.
# However, this function has no shuffle attribute, so we add one line of code
# to shuffle the dataset prior to cross-validation.
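A minimal sketch of what this comment describes (assumes the baseline estimator `model`, the DataFrame `df`, and `SEED` are already defined):

from sklearn.model_selection import cross_val_score

# The extra line: shuffle the rows once before cross-validation.
df_shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

cv_scores = cross_val_score(
    model,
    df_shuffled.drop(columns="TARGET"),
    df_shuffled["TARGET"],
    scoring="roc_auc",
    cv=5
)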
# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop(columns="TARGET"),
    df["TARGET"],
    test_size=0.25,
    random_state=SEED
)
# Specify feature names and categorical features
feature_name = X_train.columns.tolist()
categorical_feature = X_train.select_dtypes('category').columns.tolist()
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
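These imports suggest how the one-hot matrix `train_dummies` used in the SMOTE step below might have been built; a sketch under that assumption (the names `encoder` and `valid_dummies` are illustrative):

# One-hot encode the categorical columns and pass the remaining columns through.
encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_feature),
    remainder="passthrough"
)
train_dummies = encoder.fit_transform(X_train)
valid_dummies = encoder.transform(X_valid)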
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[147] valid_0's auc: 0.860844 valid_1's auc: 0.778985
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[99] valid_0's auc: 0.836905 valid_1's auc: 0.777066
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[121] valid_0's auc: 0.846901 valid_1's auc: 0.777927
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[118] valid_0's auc: 0.846341 valid_1's auc: 0.778487
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[114] valid_0's auc: 0.845653 valid_1's auc: 0.776624
valid auc: 0.779 +/- 0.001
evaluate cost time 0 hours 1 minutes 57 seconds
model 2: Set class weight
from sklearn.base import clone

# Construct a new unfitted estimator with the same parameters.
model2 = clone(model)
model2.set_params(class_weight='balanced')
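For reference, the note further below mentions is_unbalance=True; in native-parameter form the same re-weighting idea can be expressed like this (a sketch, not the author's code):

# is_unbalance=True re-weights the positive class, similar in effect to
# class_weight='balanced' in the sklearn API.
model2_native = lgb.LGBMClassifier(is_unbalance=True, random_state=SEED)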
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[122] valid_0's auc: 0.843105 valid_1's auc: 0.780157
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[95] valid_0's auc: 0.831016 valid_1's auc: 0.780049
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[107] valid_0's auc: 0.835709 valid_1's auc: 0.779769
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[159] valid_0's auc: 0.856821 valid_1's auc: 0.781057
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[138] valid_0's auc: 0.848312 valid_1's auc: 0.779905
valid auc: 0.780 +/- 0.002
evaluate cost time 0 hours 2 minutes 20 seconds
After setting is_unbalance=True, the model improves somewhat.
model 3: SMOTE
from imblearn.over_sampling import SMOTE
import imblearn
X_balanced, y_balanced = SMOTE(random_state=SEED).fit_resample(train_dummies, y_train)
print('balanced train data shape:', X_balanced.shape)
balanced train data shape: (423998, 990)
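The evaluation helper that produced the fold logs below is not shown in this section; a direct way to train on the balanced data would look roughly like this (a sketch, assuming `model` is the baseline LGBMClassifier and `valid_dummies` is the one-hot matrix for X_valid):

# Fit a fresh copy of the baseline model on the SMOTE-balanced training data.
model3 = clone(model)
model3.fit(
    X_balanced,
    y_balanced,
    eval_set=[(valid_dummies, y_valid)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(20)]
)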
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[64] valid_0's auc: 0.726936 valid_1's auc: 0.7216
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[138] valid_0's auc: 0.834743 valid_1's auc: 0.780546
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[167] valid_0's auc: 0.849441 valid_1's auc: 0.782093
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[140] valid_0's auc: 0.834219 valid_1's auc: 0.780796
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[166] valid_0's auc: 0.848353 valid_1's auc: 0.780799
valid auc: 0.976 +/- 0.048
evaluate cost time 0 hours 5 minutes 46 seconds
model 4: Ensemble method
from imblearn.ensemble import BalancedRandomForestClassifier
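A usage sketch for the import above (assumptions: it is fit on the one-hot matrices train_dummies / valid_dummies, which are taken to be imputed since random forests cannot handle missing values, and the hyperparameters are illustrative):

from sklearn.metrics import roc_auc_score

# Illustrative fit of a balanced random forest on the one-hot encoded data.
brf = BalancedRandomForestClassifier(n_estimators=500, random_state=SEED, n_jobs=-1)
brf.fit(train_dummies, y_train)
valid_proba = brf.predict_proba(valid_dummies)[:, 1]
print('BalancedRandomForest valid AUC: %.4f' % roc_auc_score(y_valid, valid_proba))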
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[116] valid_0's auc: 0.840709 valid_0's focal_loss: 0.0792912 valid_1's auc: 0.780966 valid_1's focal_loss: 0.0886921
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[87] valid_0's auc: 0.82691 valid_0's focal_loss: 0.0816416 valid_1's auc: 0.779874 valid_1's focal_loss: 0.0888508
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[101] valid_0's auc: 0.832985 valid_0's focal_loss: 0.0805644 valid_1's auc: 0.779294 valid_1's focal_loss: 0.0889485
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[87] valid_0's auc: 0.827538 valid_0's focal_loss: 0.0816012 valid_1's auc: 0.78189 valid_1's focal_loss: 0.0885146
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[119] valid_0's auc: 0.840904 valid_0's focal_loss: 0.0792486 valid_1's auc: 0.781548 valid_1's focal_loss: 0.0886565
valid auc: nan +/- nan
evaluate cost time 0 hours 2 minutes 16 seconds
Hyperparameter and objective-function setup
# Here we use Optuna
# Define the search space and the objective function.
def objective(trial):
    # LightGBM can use a dictionary to set parameters.
    params = dict(
        boosting_type = 'gbdt',
        objective = 'binary',
        metric = 'auc',
        is_unbalance = True,
        num_boost_round = trial.suggest_int("num_boost_round", 50, 2000, step=50),
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 10, log=True),
        max_depth = trial.suggest_int("max_depth", 2, 10),
        feature_fraction = trial.suggest_float("feature_fraction", 0.2, 1.0),
        bagging_fraction = trial.suggest_float("bagging_fraction", 0.2, 1.0),
        bagging_freq = 5,
        lambda_l1 = trial.suggest_float("lambda_l1", 1e-4, 1e2, log=True),
        lambda_l2 = trial.suggest_float("lambda_l2", 1e-4, 1e2, log=True),
        random_state = SEED,
        verbosity = -1
    )

    # Perform cross-validation with the given parameters.
    eval_results = lgb.cv(
        params,
        dtrain,
        nfold = 5,
        shuffle = True,
        feature_name = feature_name,
        categorical_feature = categorical_feature,
        callbacks = [lgb.early_stopping(20)]
    )
    return eval_results['valid auc-mean'][-1]
Bayesian optimization
# Bayesian optimization
import optuna

# Create a study object.
study = optuna.create_study(
    study_name = 'lightgbm-study',   # Unique identifier of the study.
    direction = 'maximize'
)
print("Best trial until now:") print(" Value: ", study.best_trial.value) print(" Params: ") for key, value in study.best_trial.params.items(): print(f" {key}: {value}")
# Continue the study.
study.optimize(
    objective,
    n_trials = 100,
    timeout = 7200,
    gc_after_trial = True,
    show_progress_bar = True
)
print("Number of finished trials: ", len(study.trials)) print("Best trial until now:") print(" Best value: ", study.best_trial.value) print(" Best params: ") for key, value in study.best_trial.params.items(): print(f" {key}: {value}")
Number of finished trials: 135
Best trial until now:
Best value: 0.7865747325768904
Best params:
num_boost_round: 1300
learning_rate: 0.015480784915810246
max_depth: 8
feature_fraction: 0.3519165350962246
bagging_fraction: 0.9999568798413535
lambda_l1: 65.08840723355036
lambda_l2: 15.024421566966097
# In LightGBM, the validation data should be aligned with the training data.
# If you want to re-use the data, remember to set free_raw_data=False.
dvalid = lgb.Dataset(
    X_valid,
    label=y_valid,
    reference=dtrain,
    free_raw_data=True
)
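The final training call itself is not shown here; the following sketch is consistent with the tuned parameters and the log below (the exact callback setup is an assumption):

# Combine the fixed parameters with the best trial's suggestions.
best_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    is_unbalance=True,
    bagging_freq=5,
    random_state=SEED,
    verbosity=-1,
    **study.best_trial.params
)

print('Starting training...')
bst = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    callbacks=[lgb.early_stopping(20), lgb.log_evaluation(100)]
)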
Starting training...
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
Training until validation scores don't improve for 20 rounds
[100] training's auc: 0.77831 valid_1's auc: 0.760952
[200] training's auc: 0.793115 valid_1's auc: 0.770076
[300] training's auc: 0.803729 valid_1's auc: 0.775631
[400] training's auc: 0.811797 valid_1's auc: 0.778893
[500] training's auc: 0.818789 valid_1's auc: 0.78126
[600] training's auc: 0.825071 valid_1's auc: 0.782986
[700] training's auc: 0.830958 valid_1's auc: 0.784242
[800] training's auc: 0.836567 valid_1's auc: 0.785216
[900] training's auc: 0.841761 valid_1's auc: 0.785837
[1000] training's auc: 0.846603 valid_1's auc: 0.786335
[1100] training's auc: 0.851281 valid_1's auc: 0.786744
Early stopping, best iteration is:
[1118] training's auc: 0.852112 valid_1's auc: 0.786804
Visualization
# Plot metrics recorded during training.
ax = lgb.plot_metric(eval_results, metric='auc')
plt.show()
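Note that lgb.plot_metric needs either a Booster or a dict of per-iteration results; if `eval_results` here is meant to hold the history of the final training run, it could be recorded like this (a sketch reusing the assumed `best_params` from above):

# Record per-iteration metrics during training so plot_metric has a history to draw.
eval_results = {}
bst = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    callbacks=[
        lgb.early_stopping(20),
        lgb.log_evaluation(100),
        lgb.record_evaluation(eval_results)
    ]
)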
The PSI (Population Stability Index) measures the difference between an actual distribution and an expected distribution. In modeling it is commonly used to screen feature variables and to assess model stability. The training sample (In the Sample, INS) is usually taken as the expected distribution, while the distribution of a validation sample across score bands serves as the actual distribution. Validation samples generally include out-of-sample (OOS) and out-of-time (OOT) data.
Risk-control models commonly use PSI to measure model stability.
def calc_psi(expected, actual, n_bins=10):
    '''
    Calculate the PSI (Population Stability Index) for two vectors.

    Args:
        expected: array-like, represents the expected (reference) distribution.
        actual: array-like, represents the actual distribution.
        n_bins: int, the number of quantile bins to use.

    Returns:
        float, the PSI value.
    '''
    # Build quantile bins from the expected distribution
    _, bins = pd.qcut(expected, n_bins, retbins=True, duplicates='drop')
    # Extend the outer edges so that every value falls into a bin
    bins = [-np.inf] + list(bins)[1:-1] + [np.inf]

    # Expected frequencies in each bin
    expected_freq = pd.cut(expected, bins).value_counts().sort_index()
    expected_freq = expected_freq / expected_freq.sum()

    # Actual frequencies in the same bins
    actual_freq = pd.cut(actual, bins).value_counts().sort_index()
    actual_freq = actual_freq / actual_freq.sum()

    # PSI = sum((actual% - expected%) * ln(actual% / expected%))
    psi = (actual_freq - expected_freq) * np.log(actual_freq / expected_freq)
    return psi.sum()
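A usage sketch: compare the model's score distribution on the training set (expected) with the validation set (actual).

# PSI below 0.1 is usually read as a stable score distribution.
train_scores = bst.predict(X_train, num_iteration=bst.best_iteration)
valid_scores = bst.predict(X_valid, num_iteration=bst.best_iteration)
print('PSI (train vs. valid): %.4f' % calc_psi(train_scores, valid_scores))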
# Plot the 54th tree (a tree that uses a categorical feature to split).
# ax = lgb.plot_tree(bst, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
# plt.show()
# Plot the 54th tree with graphviz.
# graph = lgb.create_tree_digraph(bst, tree_index=53, name='Tree54')
# graph.render(view=True)
Step 9: Model persistence
# Save the model to a file.
print('Saving model...')
bst.save_model(path + 'lgb_model.txt')
Saving model...
<lightgbm.basic.Booster at 0x2c457d3a0>
Step 10: Predict
# Perform predictions.
# If early stopping is enabled during training, you can get predictions from
# the best iteration with bst.best_iteration.
predictions = bst.predict(X_valid, num_iteration=bst.best_iteration)
# Load a saved model to predict.
print('Loading model to predict...')
bst = lgb.Booster(model_file=path + 'lgb_model.txt')
predictions = bst.predict(X_valid)
Loading model to predict...
# Save predictions
# predictions.to_csv('valid_predictions.csv', index=True)
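Since bst.predict returns a NumPy array, one way to actually write the predictions out is to wrap them in a pandas Series first (a sketch; the column name is an assumption):

# Wrap the NumPy predictions in a Series so they can be written to CSV.
pd.Series(predictions, index=X_valid.index, name='prediction').to_csv('valid_predictions.csv', index=True)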
Appendix: FocalLoss
import numpy as np
from scipy import optimize, special


class BinaryFocalLoss:
    def __init__(self, gamma, alpha=None):
        # FocalLoss only needs these two parameters; when alpha=None it defaults to 1.
        self.alpha = alpha
        self.gamma = gamma
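The class above is shown only in part. Below is a self-contained sketch of how the remaining pieces are often written for LightGBM; the formulas and the lgb_obj / lgb_eval wrappers follow a common focal-loss implementation and are assumptions, not the author's exact code. The wrappers use the classic (preds, dataset) convention of LightGBM's custom-objective and custom-metric hooks, which would produce the 'focal_loss' metric seen in the model-4 logs.

import numpy as np
from scipy import optimize, special

class BinaryFocalLossSketch:
    def __init__(self, gamma, alpha=None):
        self.alpha = alpha
        self.gamma = gamma

    def at(self, y):
        # Per-sample alpha weight; defaults to 1 when alpha is None.
        if self.alpha is None:
            return np.ones_like(y, dtype=float)
        return np.where(y, self.alpha, 1 - self.alpha)

    def pt(self, y, p):
        # Probability assigned to the true class, clipped for numerical stability.
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return np.where(y, p, 1 - p)

    def __call__(self, y_true, y_pred):
        # Focal loss value: -alpha_t * (1 - p_t)^gamma * log(p_t)
        at, pt = self.at(y_true), self.pt(y_true, y_pred)
        return -at * (1 - pt) ** self.gamma * np.log(pt)

    def grad(self, y_true, y_pred):
        # First derivative of the loss with respect to the raw score.
        y = 2 * y_true - 1
        at, pt, g = self.at(y_true), self.pt(y_true, y_pred), self.gamma
        return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)

    def hess(self, y_true, y_pred):
        # Second derivative of the loss with respect to the raw score.
        y = 2 * y_true - 1
        at, pt, g = self.at(y_true), self.pt(y_true, y_pred), self.gamma
        u = at * y * (1 - pt) ** g
        du = -at * y * g * (1 - pt) ** (g - 1)
        v = g * pt * np.log(pt) + pt - 1
        dv = g * np.log(pt) + g + 1
        return (du * v + u * dv) * y * pt * (1 - pt)

    def init_score(self, y_true):
        # Constant probability that minimizes the loss, returned as a raw score (log-odds).
        res = optimize.minimize_scalar(
            lambda p: self(y_true, p).sum(), bounds=(0, 1), method='bounded'
        )
        return special.logit(res.x)

    def lgb_obj(self, preds, train_data):
        # Custom objective hook: per-sample gradient and hessian on raw scores.
        y = train_data.get_label()
        p = special.expit(preds)
        return self.grad(y, p), self.hess(y, p)

    def lgb_eval(self, preds, eval_data):
        # Custom metric hook: (name, value, is_higher_better).
        y = eval_data.get_label()
        p = special.expit(preds)
        return 'focal_loss', self(y, p).mean(), False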