Incremental Learning

Scikit-learn¶

Baseline¶

First, fit a HistGradientBoostingClassifier on the full training set in one pass; this serves as the reference point for the incremental approaches below.

In [1]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings

# Setting configuration.
warnings.filterwarnings('ignore')

SEED = 42
In [2]:
# Load dataset
path = '../../datasets/Home-Credit-Default-Risk/'
data = pd.read_csv(path + 'prepared_data.csv', index_col='SK_ID_CURR')
In [3]:
data.shape
Out[3]:
(307511, 158)
In [4]:
data.groupby('TARGET').size()
Out[4]:
TARGET
0    282686
1     24825
dtype: int64
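The classes are imbalanced at roughly 11:1; this ratio motivates class_weight='balanced' below and the explicit weights used later (scale_pos_weight = 11 for XGBoost, class_weight = {1: 11, 0: 1} for SGD). A quick check (illustrative; not part of the original run):

neg, pos = data.groupby('TARGET').size()
print(f'imbalance ratio: {neg / pos:.1f} : 1')  # ~11.4 : 1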
In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1), 
    data['TARGET'], 
    test_size=0.25,
    random_state=SEED
)
In [6]:
gbc = HistGradientBoostingClassifier(
    class_weight = 'balanced',
    scoring = 'roc_auc',
    max_iter = 1000, 
    max_depth = 8,
    max_features = 0.35,
    learning_rate = 0.015,
    l2_regularization = 15,
    n_iter_no_change = 20,
    random_state = SEED,
    verbose = 0
)
gbc.fit(X_train, y_train)
Out[6]:
HistGradientBoostingClassifier(class_weight='balanced', l2_regularization=15,
                               learning_rate=0.015, max_depth=8,
                               max_features=0.35, max_iter=1000,
                               n_iter_no_change=20, random_state=42,
                               scoring='roc_auc')
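Note that early stopping is active here: with scoring='roc_auc' and n_iter_no_change=20, training stops once the score on an internal validation split (validation_fraction, 10% by default) stops improving, so fewer than max_iter trees may be built. The fitted count can be inspected (illustrative):

print(gbc.n_iter_)  # number of boosting iterations actually performed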
In [7]:
train_auc = roc_auc_score(y_train, gbc.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.4f}")
print(f"test's auc: {test_auc:.4f}")
train's auc: 0.7950
test's auc: 0.7643

Adding new trees with the warm_start parameter¶

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
In [9]:
# Create data stream
def get_minibatch(minibatch_size):
    path = '../../datasets/Home-Credit-Default-Risk/'
    return pd.read_csv(
        path + 'prepared_data.csv', 
        index_col='SK_ID_CURR', 
        chunksize = minibatch_size  # return iterator
    )
In [10]:
gbdt = GradientBoostingClassifier(
    learning_rate = 0.015,
    n_estimators = 0,    # start with no trees; the refresh function below adds 100 per mini-batch before fitting
    subsample = 1.0,
    max_features = 0.35,
    max_depth = 8,
    n_iter_no_change = 20,
    warm_start = True,   # keep the existing trees and continue training on each fit() call
    random_state = SEED,
    verbose = 0
)
In [11]:
def gbdt_with_warm_start(X_train, y_train, X_test, y_test, i=None):
    # grow 100 additional trees on the current mini-batch; warm_start keeps the existing ones
    gbdt.n_estimators += 100
    gbdt.fit(X_train, y_train)
    train_pred = gbdt.predict_proba(X_train)[:, 1]
    test_pred = gbdt.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, gbdt.n_estimators
In [12]:
def incremental_learning(iterator, refresh):
    test = iterator.get_chunk(size = 75000)
    print('test data shape:', test.shape)
    
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']

    tick = time.time()
    n_train = 0
    auc_history = []
    
    # Main loop : iterate on mini-batches of examples
    for i, train in enumerate(iterator):
        X_train = train.drop('TARGET', axis = 1)
        y_train = train['TARGET']
        n_train += X_train.shape[0]
        
        # update model with examples in the current mini-batch
        train_pred, test_pred, num_trees = refresh(X_train, y_train, X_test, y_test, i)
        duration = time.time() - tick
        
        train_auc = roc_auc_score(y_train, train_pred)
        test_auc = roc_auc_score(y_test, test_pred)
        
        # report progress information
        if num_trees is None:
            num = n_train
            condition = f'{n_train} train samples'
        else:
            num = num_trees
            condition = f'{num_trees} trees'
        
        auc_history.append((num, train_auc, test_auc))
        progress = f"{condition}, valid's auc: {test_auc:.4f} in {duration:.2f}s" 
        print(progress)
    
    print(f"finally:")  
    print(f"  train's auc: {train_auc:.4f}") 
    print(f"  valid's auc: {test_auc:.4f}") 
    
    return auc_history
In [13]:
batch_size = 20000

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, gbdt_with_warm_start)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7274 in 12.06s
200 trees, valid's auc: 0.7359 in 23.70s
300 trees, valid's auc: 0.7378 in 35.58s
400 trees, valid's auc: 0.7387 in 47.15s
500 trees, valid's auc: 0.7396 in 59.23s
600 trees, valid's auc: 0.7390 in 71.42s
700 trees, valid's auc: 0.7390 in 83.76s
800 trees, valid's auc: 0.7390 in 96.39s
900 trees, valid's auc: 0.7376 in 109.14s
1000 trees, valid's auc: 0.7375 in 122.00s
1100 trees, valid's auc: 0.7350 in 135.21s
1200 trees, valid's auc: 0.7316 in 144.29s
finally:
  train's auc: 0.8363
  valid's auc: 0.7316
In [14]:
def plot_score_evolution(auc_history, xlabel):
    """xlabel: training examples (#) or num trees"""
    plt.figure()
    ticks, train_auc, test_auc = zip(*auc_history)
    plt.title('Metric during incremental learning') 
    plt.xlabel(xlabel)
    plt.ylabel('auc')
    plt.grid(True)
    plt.plot(ticks, train_auc, label='train')
    plt.plot(ticks, test_auc, label='test')
    plt.legend(loc='best', title='auc')
In [15]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
[Figure: train and test AUC vs. number of trees]

Updating the model with the partial_fit method¶

Scikit-learn's gradient boosting estimators do not implement partial_fit, so this section uses SGDClassifier, one of the estimators that support out-of-core learning.

In [16]:
from sklearn.linear_model import SGDClassifier
In [17]:
num_rounds = 1200  # same as the number of warm_start trees in the previous section

sgd = SGDClassifier(
    class_weight = {1: 11, 0: 1}, 
    loss='log_loss', 
    alpha = 0.01,
    max_iter = num_rounds,  
    penalty='elasticnet',
    l1_ratio = 0.5
)
In [18]:
def sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
    # the full list of classes must be supplied on the first call to partial_fit
    sgd.partial_fit(X_train, y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(X_train)[:, 1]
    test_pred = sgd.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, None

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, sgd_partial_fit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.5038 in 0.19s
40000 train samples, valid's auc: 0.5174 in 0.45s
60000 train samples, valid's auc: 0.5004 in 0.71s
80000 train samples, valid's auc: 0.5284 in 0.97s
100000 train samples, valid's auc: 0.5014 in 1.22s
120000 train samples, valid's auc: 0.5002 in 1.46s
140000 train samples, valid's auc: 0.5175 in 1.71s
160000 train samples, valid's auc: 0.5003 in 1.98s
180000 train samples, valid's auc: 0.5461 in 2.22s
200000 train samples, valid's auc: 0.5458 in 2.45s
220000 train samples, valid's auc: 0.5323 in 2.73s
232511 train samples, valid's auc: 0.5009 in 2.91s
finally:
  train's auc: 0.5020
  valid's auc: 0.5009

The AUC stays close to 0.5 here: linear models fitted with SGD are highly sensitive to feature scaling, and these features are unscaled, which likely explains the poor scores. Plot of the results:

In [19]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
[Figure: train and test AUC vs. number of training examples]
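Since StandardScaler also supports partial_fit, standardization can be folded into the same streaming loop. A minimal sketch of a scaled refresh function (not run in the original notebook; names are illustrative):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

def scaled_sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
    # update the running mean/variance with the current mini-batch, then standardize
    scaler.partial_fit(X_train)
    sgd.partial_fit(scaler.transform(X_train), y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(scaler.transform(X_train))[:, 1]
    test_pred = sgd.predict_proba(scaler.transform(X_test))[:, 1]
    return train_pred, test_pred, None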

XGBoost¶

XGBoost offers two modes of incremental learning:

  • one adds new trees on top of the current ensemble, leaving the existing trees unchanged;
  • the other keeps the current tree structures fixed and recomputes the leaf weights and/or leaf values.
In [20]:
import xgboost as xgb

Adding new trees with the xgb_model parameter¶

In [21]:
# specify parameters via map
params = dict(
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,
    learning_rate = 0.015,
    max_depth = 8,
    subsample = 1.0,
    colsample_bytree = 0.35,
    reg_alpha = 65,
    reg_lambda = 15,
    seed = SEED,
    verbosity = 0
)
In [22]:
bst = None   # init model

# Each mini-batch runs for 100 boosting rounds; the 12 chunks add up to 1200 trees.
def xgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    global bst 
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        callbacks = [xgb.callback.EarlyStopping(20)],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)
    return train_pred, test_pred, bst.num_boosted_rounds()

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_continue)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7279 in 1.54s
200 trees, valid's auc: 0.7374 in 3.09s
300 trees, valid's auc: 0.7414 in 4.70s
400 trees, valid's auc: 0.7455 in 6.98s
500 trees, valid's auc: 0.7477 in 9.08s
600 trees, valid's auc: 0.7486 in 10.95s
700 trees, valid's auc: 0.7504 in 12.83s
800 trees, valid's auc: 0.7505 in 14.80s
900 trees, valid's auc: 0.7499 in 16.78s
1000 trees, valid's auc: 0.7505 in 18.87s
1100 trees, valid's auc: 0.7514 in 20.99s
1200 trees, valid's auc: 0.7518 in 22.87s
finally:
  train's auc: 0.7873
  valid's auc: 0.7518
In [23]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
[Figure: train and test AUC vs. number of trees]
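This continuation also works across sessions, since the xgb_model argument accepts a saved model file as well as an in-memory Booster. A minimal sketch, where the file name and dtrain_new are hypothetical:

# persist the current booster, then resume boosting from disk in a later session
bst.save_model('bst.json')
bst = xgb.train(params, dtrain_new, num_boost_round=100, xgb_model='bst.json')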

Refreshing leaf values¶

The process_type parameter controls whether training builds new trees ('default') or updates the existing ones ('update'); combined with updater = 'refresh' and refresh_leaf = True, the tree structures are kept and only the leaf values and node statistics are recomputed on the new data.

In [24]:
# specify parameters via map
params = dict(
    process_type = 'default',   # Set `process_type` to `default` if you want to build new trees.
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,
    learning_rate = 0.015,
    max_depth = 8,
    subsample = 1.0,
    colsample_bytree = 0.35,
    reg_alpha = 65,
    reg_lambda = 15,
    seed = SEED,
    verbosity = 0
)
In [25]:
bst = None   # init model

# The model adapts to new data by updating leaf values (split conditions stay unchanged)
def xgb_refresh(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # update estimator with examples in the current mini-batch
    global bst 
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)

    if i == 0:
        # after the first mini-batch, switch from building new trees to refreshing the existing ones
        params['process_type'] = "update"
        params["updater"] = "refresh"
        params["refresh_leaf"] = True  # refresh both leaf values and node statistics
    return train_pred, test_pred, None

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_refresh)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7369 in 9.14s
40000 train samples, valid's auc: 0.7419 in 17.04s
60000 train samples, valid's auc: 0.7411 in 25.00s
80000 train samples, valid's auc: 0.7428 in 33.05s
100000 train samples, valid's auc: 0.7414 in 41.19s
120000 train samples, valid's auc: 0.7422 in 49.49s
140000 train samples, valid's auc: 0.7431 in 57.51s
160000 train samples, valid's auc: 0.7413 in 65.84s
180000 train samples, valid's auc: 0.7404 in 74.43s
200000 train samples, valid's auc: 0.7416 in 82.70s
220000 train samples, valid's auc: 0.7410 in 90.92s
232511 train samples, valid's auc: 0.7394 in 96.88s
finally:
  train's auc: 0.7557
  valid's auc: 0.7394
In [26]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
[Figure: train and test AUC vs. number of training examples]

LightGBM¶

LightGBM controls incremental learning in two ways:

  • if init_model is not None, training continues from the existing model, adding num_boost_round new trees;
  • the refit task keeps every tree structure of the existing model unchanged and re-fits the new data to update the leaf weights.

Adding new trees with the init_model parameter¶

In [27]:
import lightgbm as lgb
In [28]:
# specify parameters via map
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,
    learning_rate = 0.015,
    max_depth = 8,
    feature_fraction = 0.35,
    bagging_fraction = 1.0,
    lambda_l1 = 65,
    lambda_l2 = 15,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = -1
)
In [29]:
gbm = None   # init model

# Each mini-batch runs for 100 boosting rounds; the 12 chunks add up to 1200 trees.
def lgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)
    global gbm
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        init_model = gbm,
        valid_sets = [dtrain],
        callbacks = [lgb.early_stopping(stopping_rounds=20)],
        keep_training_booster=True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)
    return train_pred, test_pred, gbm.num_trees()

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_continue)
test data shape: (75000, 158)
100 trees, valid's auc: 0.7179 in 0.67s
200 trees, valid's auc: 0.7301 in 1.59s
300 trees, valid's auc: 0.7372 in 2.68s
400 trees, valid's auc: 0.7420 in 3.92s
500 trees, valid's auc: 0.7447 in 5.32s
600 trees, valid's auc: 0.7461 in 6.83s
700 trees, valid's auc: 0.7482 in 8.52s
800 trees, valid's auc: 0.7486 in 10.33s
900 trees, valid's auc: 0.7488 in 12.28s
1000 trees, valid's auc: 0.7494 in 14.39s
1100 trees, valid's auc: 0.7500 in 16.64s
1200 trees, valid's auc: 0.7506 in 18.64s
finally:
  train's auc: 0.7768
  valid's auc: 0.7506

Here the keep_training_booster (bool) parameter indicates whether the returned booster will be used to continue training; it defaults to False. When the model is very large and causes memory errors, try setting this parameter to True to avoid the model_to_string conversion; the returned booster can then still be used as init_model for future continued training.
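For continuing training in a separate process, the booster can also be serialized and restored explicitly. A minimal sketch (the file name and dtrain_new are hypothetical):

# persist the booster to disk, or round-trip it through a string
gbm.save_model('lgb_model.txt')
gbm_restored = lgb.Booster(model_str=gbm.model_to_string())

# either the file path or the restored Booster can seed continued training
gbm = lgb.train(params, dtrain_new, num_boost_round=100, init_model='lgb_model.txt')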

In [30]:
plot_score_evolution(auc_history, 'num trees')
plt.show()
[Figure: train and test AUC vs. number of trees]

Refreshing leaf values¶

In [31]:
# specify parameters via map
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,
    learning_rate = 0.015,
    max_depth = 8,
    feature_fraction = 0.35,
    bagging_fraction = 1.0,
    lambda_l1 = 65,
    lambda_l2 = 15,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = -1
)
In [32]:
gbm = None   # init model

# The model adapts to new data by updating leaf values (split conditions stay unchanged)
def lgb_refit(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)
    # update estimator with examples in the current mini-batch
    global gbm
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        init_model = gbm,
        valid_sets = [dtrain],
        keep_training_booster=True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)    
    if i == 0:
        # after the first mini-batch, switch to the refit task for subsequent calls
        params['task'] = 'refit'
        params['refit_decay_rate'] = 0.9
    return train_pred, test_pred, None

minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_refit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7373 in 5.65s
40000 train samples, valid's auc: 0.7423 in 12.74s
60000 train samples, valid's auc: 0.7423 in 21.02s
80000 train samples, valid's auc: 0.7424 in 30.64s
100000 train samples, valid's auc: 0.7438 in 42.43s
120000 train samples, valid's auc: 0.7409 in 56.15s
140000 train samples, valid's auc: 0.7419 in 72.42s
160000 train samples, valid's auc: 0.7394 in 91.47s
180000 train samples, valid's auc: 0.7363 in 114.02s
200000 train samples, valid's auc: 0.7386 in 140.03s
220000 train samples, valid's auc: 0.7360 in 170.18s
232511 train samples, valid's auc: 0.7347 in 198.44s
finally:
  train's auc: 0.8629
  valid's auc: 0.7347

Here refit_decay_rate controls the decay rate of the leaf outputs in the refit task. After refitting, each leaf's output is computed as:
leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output
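The same leaf refresh is also exposed directly as Booster.refit, which avoids mutating params between batches. A minimal sketch, assuming a trained gbm and a new mini-batch X_new, y_new:

# re-fit leaf values on the new data; every tree structure stays unchanged
gbm = gbm.refit(X_new, y_new, decay_rate=0.9)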

In [33]:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
[Figure: train and test AUC vs. number of training examples]