Scikit-learn¶
Baseline¶
In [1]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
# Setting configuration.
warnings.filterwarnings('ignore')
SEED = 42
In [2]:
# Load dataset
path = '../../datasets/Home-Credit-Default-Risk/'
data = pd.read_csv(path + 'prepared_data.csv', index_col='SK_ID_CURR')
In [3]:
data.shape
Out[3]:
(307511, 158)
In [4]:
data.groupby('TARGET').size()
Out[4]:
TARGET
0    282686
1     24825
dtype: int64
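The classes are highly imbalanced: positives make up about 8% of the rows, and the ratio 282686 / 24825 ≈ 11.4 matches the scale_pos_weight = 11 and class_weight = {1: 11, 0: 1} settings used further below.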
In [5]:
X_train, X_test, y_train, y_test = train_test_split(
data.drop('TARGET', axis=1),
data['TARGET'],
test_size=0.25
)
In [6]:
gbc = HistGradientBoostingClassifier(
class_weight = 'balanced',
scoring = 'roc_auc',
max_iter = 1000,
max_depth = 8,
max_features = 0.35,
learning_rate = 0.015,
l2_regularization = 15,
n_iter_no_change = 20,
random_state = SEED,
verbose = 0
)
gbc.fit(X_train, y_train)
Out[6]:
HistGradientBoostingClassifier(class_weight='balanced', l2_regularization=15, learning_rate=0.015, max_depth=8, max_features=0.35, max_iter=1000, n_iter_no_change=20, random_state=42, scoring='roc_auc')
In [7]:
train_auc = roc_auc_score(y_train, gbc.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.4f}")
print(f"test's auc: {test_auc:.4f}")
train's auc: 0.7950
test's auc: 0.7643
Adding new trees with the warm_start parameter¶
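With warm_start=True, each call to fit reuses the trees that are already built and only fits enough new ones to reach the current n_estimators, so increasing n_estimators between fit calls grows the ensemble incrementally, one mini-batch at a time.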
In [8]:
from sklearn.ensemble import GradientBoostingClassifier
In [9]:
# Create data stream
def get_minibatch(minibatch_size):
    path = '../../datasets/Home-Credit-Default-Risk/'
    return pd.read_csv(
        path + 'prepared_data.csv',
        index_col='SK_ID_CURR',
        chunksize=minibatch_size  # return an iterator over chunks
    )
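Passing chunksize to pd.read_csv returns a TextFileReader rather than a DataFrame: iterating over it yields one DataFrame of at most minibatch_size rows per step, and its get_chunk method (used below to carve off the test set) can pull a first chunk of a different size.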
In [10]:
gbdt = GradientBoostingClassifier(
learning_rate = 0.015,
n_estimators = 0,
subsample = 1.0,
max_features = 0.35,
max_depth = 8,
n_iter_no_change = 20,
warm_start = True,
random_state = SEED,
verbose = 0
)
In [11]:
def gbdt_with_warm_start(X_train, y_train, X_test, y_test, i=None):
    gbdt.n_estimators += 100  # raise the target ensemble size by 100 trees
    gbdt.fit(X_train, y_train)  # with warm_start=True, only the new trees are fit
    train_pred = gbdt.predict_proba(X_train)[:, 1]
    test_pred = gbdt.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, gbdt.n_estimators
In [12]:
def incremental_learning(iterator, refresh):
    test = iterator.get_chunk(size=75000)
    print('test data shape:', test.shape)
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
    tick = time.time()
    n_train = 0
    auc_history = []
    # Main loop: iterate over mini-batches of examples
    for i, train in enumerate(iterator):
        X_train = train.drop('TARGET', axis=1)
        y_train = train['TARGET']
        n_train += X_train.shape[0]
        # update the model with examples from the current mini-batch
        train_pred, test_pred, num_trees = refresh(X_train, y_train, X_test, y_test, i)
        duration = time.time() - tick
        train_auc = roc_auc_score(y_train, train_pred)
        test_auc = roc_auc_score(y_test, test_pred)
        # report progress information
        if num_trees is None:
            num = n_train
            condition = f'{n_train} train samples'
        else:
            num = num_trees
            condition = f'{num_trees} trees'
        auc_history.append((num, train_auc, test_auc))
        progress = f"{condition}, valid's auc: {test_auc:.4f} in {duration:.2f}s"
        print(progress)
    print("finally:")
    print(f" train's auc: {train_auc:.4f}")
    print(f" valid's auc: {test_auc:.4f}")
    return auc_history
In [13]:
batch_size = 20000
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, gbdt_with_warm_start)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7274 in 12.06s
200 trees, valid's auc: 0.7359 in 23.70s
300 trees, valid's auc: 0.7378 in 35.58s
400 trees, valid's auc: 0.7387 in 47.15s
500 trees, valid's auc: 0.7396 in 59.23s
600 trees, valid's auc: 0.7390 in 71.42s
700 trees, valid's auc: 0.7390 in 83.76s
800 trees, valid's auc: 0.7390 in 96.39s
900 trees, valid's auc: 0.7376 in 109.14s
1000 trees, valid's auc: 0.7375 in 122.00s
1100 trees, valid's auc: 0.7350 in 135.21s
1200 trees, valid's auc: 0.7316 in 144.29s
finally:
 train's auc: 0.8363
 valid's auc: 0.7316
In [14]:
def plot_score_evolution(auc_history, xlabel):
    """xlabel: training examples (#) or num trees"""
    plt.figure()
    ticks, train_auc, test_auc = zip(*auc_history)
    plt.title('Metric during incremental learning')
    plt.xlabel(xlabel)
    plt.ylabel('auc')
    plt.grid(True)
    plt.plot(ticks, train_auc, label='train')
    plt.plot(ticks, test_auc, label='test')
    plt.legend(loc='best', title='auc')
In [15]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
Incremental updates with the partial_fit method¶
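scikit-learn's gradient boosting estimators do not implement partial_fit, so this section falls back to SGDClassifier with log loss as the incrementally updatable model.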
In [16]:
from sklearn.linear_model import SGDClassifier
In [17]:
num_rounds = 1200  # same as the number of warm_start trees in the previous section
sgd = SGDClassifier(
class_weight = {1: 11, 0: 1},
loss='log_loss',
alpha = 0.01,
max_iter = num_rounds,
penalty='elasticnet',
l1_ratio = 0.5
)
In [18]:
def sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
    sgd.partial_fit(X_train, y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(X_train)[:, 1]
    test_pred = sgd.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, sgd_partial_fit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.5038 in 0.19s
40000 train samples, valid's auc: 0.5174 in 0.45s
60000 train samples, valid's auc: 0.5004 in 0.71s
80000 train samples, valid's auc: 0.5284 in 0.97s
100000 train samples, valid's auc: 0.5014 in 1.22s
120000 train samples, valid's auc: 0.5002 in 1.46s
140000 train samples, valid's auc: 0.5175 in 1.71s
160000 train samples, valid's auc: 0.5003 in 1.98s
180000 train samples, valid's auc: 0.5461 in 2.22s
200000 train samples, valid's auc: 0.5458 in 2.45s
220000 train samples, valid's auc: 0.5323 in 2.73s
232511 train samples, valid's auc: 0.5009 in 2.91s
finally:
 train's auc: 0.5020
 valid's auc: 0.5009
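The near-random AUC is not surprising: SGD-based linear models are sensitive to feature scaling, and the features are used raw here. A minimal sketch of the usual remedy, learning the scaler incrementally alongside the model (scaled_sgd_partial_fit is a hypothetical drop-in for sgd_partial_fit above, usable as incremental_learning(get_minibatch(batch_size), scaled_sgd_partial_fit)):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

def scaled_sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
    # update the running mean/variance estimates on this mini-batch only
    scaler.partial_fit(X_train)
    sgd.partial_fit(scaler.transform(X_train), y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(scaler.transform(X_train))[:, 1]
    test_pred = sgd.predict_proba(scaler.transform(X_test))[:, 1]
    return train_pred, test_pred, None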
The results are plotted below:
In [19]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
XGBoost¶
XGBoost provides two modes of incremental learning:
- adding new trees on top of the current ones, leaving the existing trees unchanged;
- keeping the current tree structures fixed and recomputing the leaf statistics and/or leaf values.
In [20]:
import xgboost as xgb
Adding new trees with the xgb_model parameter¶
In [21]:
# specify parameters via map
params = dict(
booster = 'gbtree',
objective = 'binary:logistic',
eval_metric = 'auc',
scale_pos_weight = 11,
learning_rate = 0.015,
max_depth = 8,
subsample = 1.0,
colsample_bytree = 0.35,
reg_alpha = 65,
reg_lambda = 15,
seed = SEED,
verbosity = 0
)
In [22]:
bst = None # init model
# Train 1200 iterations in total; each chunk runs for 100 iterations.
def xgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    global bst
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        callbacks = [xgb.callback.EarlyStopping(20)],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)
    return train_pred, test_pred, bst.num_boosted_rounds()
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_continue)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7279 in 1.54s
200 trees, valid's auc: 0.7374 in 3.09s
300 trees, valid's auc: 0.7414 in 4.70s
400 trees, valid's auc: 0.7455 in 6.98s
500 trees, valid's auc: 0.7477 in 9.08s
600 trees, valid's auc: 0.7486 in 10.95s
700 trees, valid's auc: 0.7504 in 12.83s
800 trees, valid's auc: 0.7505 in 14.80s
900 trees, valid's auc: 0.7499 in 16.78s
1000 trees, valid's auc: 0.7505 in 18.87s
1100 trees, valid's auc: 0.7514 in 20.99s
1200 trees, valid's auc: 0.7518 in 22.87s
finally:
 train's auc: 0.7873
 valid's auc: 0.7518
In [23]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
Refreshing leaf values¶
The process_type parameter is used to update the leaf values: with process_type='update' and updater='refresh', XGBoost re-runs the existing trees over the new data and refreshes node statistics and leaf values (refresh_leaf=True) without changing any split conditions.
In [24]:
# specify parameters via map
params = dict(
process_type = 'default', # Set `process_type` to `default` if you want to build new trees.
booster = 'gbtree',
objective = 'binary:logistic',
eval_metric = 'auc',
scale_pos_weight = 11,
learning_rate = 0.015,
max_depth = 8,
subsample = 1.0,
colsample_bytree = 0.35,
reg_alpha = 65,
reg_lambda = 15,
seed = SEED,
verbosity = 0
)
In [25]:
bst = None # init model
# The model will adapt to new data by changing leaf value (no change in split condition)
def xgb_refresh(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # update estimator with examples in the current mini-batch
    global bst
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)
    if i == 0:
        params['process_type'] = "update"
        params["updater"] = "refresh"
        params["refresh_leaf"] = True  # refresh the leaf values and tree statistics
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_refresh)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7369 in 9.14s
40000 train samples, valid's auc: 0.7419 in 17.04s
60000 train samples, valid's auc: 0.7411 in 25.00s
80000 train samples, valid's auc: 0.7428 in 33.05s
100000 train samples, valid's auc: 0.7414 in 41.19s
120000 train samples, valid's auc: 0.7422 in 49.49s
140000 train samples, valid's auc: 0.7431 in 57.51s
160000 train samples, valid's auc: 0.7413 in 65.84s
180000 train samples, valid's auc: 0.7404 in 74.43s
200000 train samples, valid's auc: 0.7416 in 82.70s
220000 train samples, valid's auc: 0.7410 in 90.92s
232511 train samples, valid's auc: 0.7394 in 96.88s
finally:
 train's auc: 0.7557
 valid's auc: 0.7394
In [26]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
LightGBM¶
LightGBM has two ways to control incremental learning:
- if init_model is not None, training continues from the existing model and adds num_boost_round new trees;
- the refit task keeps all of the existing tree structures and refits the leaf values on the new data (a sketch of the equivalent Booster.refit call follows this list).
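The refit task is passed through the params dict in the last section below; the Booster API also exposes the same refresh directly. A minimal sketch, where gbm is assumed to be an already trained lightgbm.Booster and X_new/y_new are placeholders for a fresh mini-batch:

# Sketch only: refit the leaf values of the existing trees on new data;
# every split condition stays unchanged.
refitted = gbm.refit(X_new, y_new, decay_rate=0.9)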
Adding new trees with the init_model parameter¶
In [27]:
import lightgbm as lgb
In [28]:
# specify parameters via map
params = dict(
boosting_type = 'gbdt',
objective = 'binary',
metric = 'auc',
is_unbalance = True,
learning_rate = 0.015,
max_depth = 8,
feature_fraction = 0.35,
bagging_fraction = 1.0,
lambda_l1 = 65,
lambda_l2 = 15,
subsample_freq = 5,
random_state = SEED,
verbosity = -1
)
In [29]:
gbm = None # init model
# Train 1200 iterations in total; each chunk runs for 100 iterations.
def lgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)
    global gbm
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        init_model = gbm,
        valid_sets = [dtrain],
        callbacks = [lgb.early_stopping(stopping_rounds=20)],
        keep_training_booster=True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)
    return train_pred, test_pred, gbm.num_trees()
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_continue)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7179 in 0.67s
200 trees, valid's auc: 0.7301 in 1.59s
300 trees, valid's auc: 0.7372 in 2.68s
400 trees, valid's auc: 0.7420 in 3.92s
500 trees, valid's auc: 0.7447 in 5.32s
600 trees, valid's auc: 0.7461 in 6.83s
700 trees, valid's auc: 0.7482 in 8.52s
800 trees, valid's auc: 0.7486 in 10.33s
900 trees, valid's auc: 0.7488 in 12.28s
1000 trees, valid's auc: 0.7494 in 14.39s
1100 trees, valid's auc: 0.7500 in 16.64s
1200 trees, valid's auc: 0.7506 in 18.64s
finally:
 train's auc: 0.7768
 valid's auc: 0.7506
The keep_training_booster (bool) parameter indicates whether the returned booster will be used for further training; it defaults to False. When the model is very large and triggers memory errors, try setting this parameter to True to avoid the internal model_to_string conversion; the returned booster can then still be passed as init_model for future continued training.
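For illustration, a minimal sketch (not part of the notebook) of the explicit round-trip that keep_training_booster=True avoids; dtrain_next is a placeholder for the next mini-batch's Dataset:

model_str = gbm.model_to_string()            # serialize the booster to text
restored = lgb.Booster(model_str=model_str)  # rebuild a Booster from the string
gbm = lgb.train(params, dtrain_next, num_boost_round=100, init_model=restored)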
In [30]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
Refreshing leaf values¶
In [31]:
# specify parameters via map
params = dict(
boosting_type = 'gbdt',
objective = 'binary',
metric = 'auc',
is_unbalance = True,
learning_rate = 0.015,
max_depth = 8,
feature_fraction = 0.35,
bagging_fraction = 1.0,
lambda_l1 = 65,
lambda_l2 = 15,
subsample_freq = 5,
random_state = SEED,
verbosity = -1
)
In [32]:
gbm = None # init model
# The model will adapt to new data by changing leaf value (no change in split condition)
def lgb_refit(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)
    # update estimator with examples in the current mini-batch
    global gbm
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        init_model = gbm,
        valid_sets = [dtrain],
        keep_training_booster=True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)
    if i == 0:
        params['task'] = 'refit'
        params['refit_decay_rate'] = 0.9
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_refit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7373 in 5.65s
40000 train samples, valid's auc: 0.7423 in 12.74s
60000 train samples, valid's auc: 0.7423 in 21.02s
80000 train samples, valid's auc: 0.7424 in 30.64s
100000 train samples, valid's auc: 0.7438 in 42.43s
120000 train samples, valid's auc: 0.7409 in 56.15s
140000 train samples, valid's auc: 0.7419 in 72.42s
160000 train samples, valid's auc: 0.7394 in 91.47s
180000 train samples, valid's auc: 0.7363 in 114.02s
200000 train samples, valid's auc: 0.7386 in 140.03s
220000 train samples, valid's auc: 0.7360 in 170.18s
232511 train samples, valid's auc: 0.7347 in 198.44s
finally:
 train's auc: 0.8629
 valid's auc: 0.7347
Here refit_decay_rate controls the decay rate of the leaf outputs during the refit task. After refitting, each leaf output is computed as:
leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output
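For example, with refit_decay_rate = 0.9, an old leaf output of 0.2 and a newly fitted value of 0.5 give 0.9 * 0.2 + 0.1 * 0.5 = 0.23, so each refresh moves the model only gently toward the new data.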
In [33]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()