Scikit-learn

In [1]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings

# Silence warnings to keep the notebook output clean.
warnings.filterwarnings('ignore')

SEED = 42
In [2]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')

categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1), 
    data['TARGET'], 
    test_size=0.25,
    random_state=SEED
)
In [4]:
gbdt = HistGradientBoostingClassifier(
    loss = 'log_loss',
    max_iter = 500, 
    learning_rate = 0.1,
    max_features = 0.7,
    max_depth = 6,
    l2_regularization = 10,
    categorical_features='from_dtype',
    scoring = "roc_auc",
    early_stopping='auto',
    n_iter_no_change = 20,
    class_weight = "balanced",
    random_state = SEED,
    verbose = 0
)

start = time.time()
gbdt.fit(X_train, y_train)
print(f'Took {time.time() - start:.0f} seconds.')
Took 15 seconds.
In [5]:
train_auc = roc_auc_score(y_train, gbdt.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, gbdt.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.6f}")
print(f"test's auc: {test_auc:.6f}")
train's auc: 0.810746
test's auc: 0.755827
In [6]:
plt.plot(gbdt.train_score_, label="train")
plt.plot(gbdt.validation_score_, label="validation")
plt.legend()
plt.title('Metric during training')
plt.xlabel('Iterations')
plt.ylabel("auc")   
Out[6]:
Text(0, 0.5, 'auc')
[Figure: training vs. validation AUC over boosting iterations]
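Unlike the libraries below, HistGradientBoostingClassifier exposes no built-in feature-importance attribute; permutation importance is one way to get a comparable ranking. A minimal sketch, reusing the fitted gbdt and the test split from above (n_repeats is an arbitrary choice):

from sklearn.inspection import permutation_importance

# Importance measured as the mean drop in test AUC when a single column is shuffled.
result = permutation_importance(
    gbdt, X_test, y_test,
    scoring = "roc_auc",
    n_repeats = 5,
    random_state = SEED,
    n_jobs = -1
)
feature_imp = pd.Series(result.importances_mean, index=X_test.columns)
print(feature_imp.sort_values(ascending=False).head(10))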

XGBoost

In [7]:
import xgboost as xgb
In [8]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')

categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1), 
    data['TARGET'], 
    test_size=0.25,
    random_state=SEED
)

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True) 
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
In [10]:
# Specify parameters as a dict.
params = dict(
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,
    learning_rate = 0.1,
    max_depth = 6,
    subsample = 0.7,
    colsample_bytree = 0.5,
    reg_alpha = 10,
    reg_lambda = 10,
    seed = SEED
)
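scale_pos_weight plays the same role as class_weight='balanced' above; it is usually set to roughly the ratio of negative to positive samples in the training labels, which is about 11 for this dataset. A quick check, assuming y_train from the split above:

# Negative/positive ratio of the training labels; roughly 11 for Home Credit.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
print(f'scale_pos_weight ~ {neg / pos:.1f}')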
In [11]:
# Training requires the parameter dict, the training DMatrix, and the number of boosting rounds.
evals_result = {}
callbacks = [
    xgb.callback.EvaluationMonitor(period=100, show_stdv=True),
    xgb.callback.EarlyStopping(20)
]

start = time.time()
bst = xgb.train(
    params, dtrain,
    num_boost_round = 500,
    evals = [(dtest, "eval"), (dtrain, "train")],
    evals_result = evals_result,
    callbacks = callbacks,
    verbose_eval = 0
)
print(f'Took {time.time() - start:.0f} seconds.')
[0]	eval-auc:0.62172	train-auc:0.64208
[100]	eval-auc:0.75551	train-auc:0.81036
[200]	eval-auc:0.75484	train-auc:0.84563
[300]	eval-auc:0.75352	train-auc:0.87289
[400]	eval-auc:0.75114	train-auc:0.89479
[499]	eval-auc:0.74911	train-auc:0.91158
Took 59 seconds.
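Note that this run went the full 500 rounds even though eval-auc peaked around iteration 100: by default the EarlyStopping callback watches the last dataset in evals, which here is the training set. A sketch of pointing it at the held-out set instead (parameter names as in recent XGBoost releases):

callbacks = [
    xgb.callback.EvaluationMonitor(period=100, show_stdv=True),
    # Watch the "eval" split so early stopping reacts to validation AUC, and keep the best model.
    xgb.callback.EarlyStopping(rounds=20, data_name="eval", maximize=True, save_best=True)
]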
In [12]:
def plot_metric(evals_result, metric):
    for label in evals_result:
        plt.plot(evals_result[label][metric], label=label)
    plt.legend()
    plt.title('Metric during training')
    plt.xlabel('Iterations')
    plt.ylabel(metric)   

plot_metric(evals_result, metric='auc')
[Figure: train and eval AUC over boosting iterations]
In [13]:
feature_imp = pd.Series(bst.get_fscore())
print(feature_imp.sort_values(ascending=False).head(10))
ORGANIZATION_TYPE    1545.0
EXT_SOURCE_3         1252.0
EXT_SOURCE_2         1176.0
OCCUPATION_TYPE      1146.0
DAYS_ID_PUBLISH      1053.0
DAYS_BIRTH            993.0
EXT_SOURCE_1          981.0
DAYS_REGISTRATION     974.0
AMT_ANNUITY           968.0
AMT_CREDIT            840.0
dtype: float64
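get_fscore() counts how often each feature is used for a split, which tends to favour high-cardinality columns such as ORGANIZATION_TYPE. A gain-based ranking from the same booster is often more informative; a small sketch:

# Average gain of the splits made on each feature, rather than raw split counts.
gain_imp = pd.Series(bst.get_score(importance_type='gain'))
print(gain_imp.sort_values(ascending=False).head(10))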

LightGBM

In [14]:
import lightgbm as lgb
In [15]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')

categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1), 
    data['TARGET'], 
    test_size=0.25,
    random_state=SEED
)

dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test, reference=dtrain)
In [17]:
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,
    num_boost_round = 500,
    learning_rate = 0.1,
    max_depth = 6,
    feature_fraction = 0.5,
    bagging_fraction = 0.7,
    lambda_l1 = 15,
    lambda_l2 = 10,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = 0
)
In [18]:
eval_results = {} # to record eval results for plotting
callbacks = [
    lgb.log_evaluation(period=100), 
    lgb.early_stopping(stopping_rounds=20),
    lgb.record_evaluation(eval_results)
]

# Training
bst = lgb.train(
    params, 
    dtrain, 
    valid_sets = [dtest, dtrain],
    valid_names = ["eval", "train"],
    callbacks = callbacks
)
[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth=6) without explicitly setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=64) in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.
Training until validation scores don't improve for 20 rounds
[100]	train's auc: 0.794883	eval's auc: 0.756161
Early stopping, best iteration is:
[165]	train's auc: 0.812678	eval's auc: 0.75743
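The num_leaves warning above can be silenced by constraining both parameters explicitly: with max_depth=6 a tree has at most 2**6 = 64 leaves, so a value just below that is a common choice. A one-line adjustment to the params dict above, as a sketch:

# Make the leaf cap explicit so it agrees with max_depth=6 (at most 2**6 = 64 leaves).
params['num_leaves'] = 63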
In [19]:
# Plotting metrics recorded during training
lgb.plot_metric(eval_results, metric='auc')
plt.show()
[Figure: train and eval AUC recorded during training]
In [20]:
feature_imp = pd.Series(
    bst.feature_importance(),
    index = bst.feature_name()
)

print(feature_imp.sort_values(ascending=False).head(10))
ORGANIZATION_TYPE    685
OCCUPATION_TYPE      269
EXT_SOURCE_3         261
EXT_SOURCE_1         215
DAYS_BIRTH           191
EXT_SOURCE_2         182
AMT_GOODS_PRICE      180
AMT_ANNUITY          176
AMT_CREDIT           166
DAYS_ID_PUBLISH      150
dtype: int32
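As with XGBoost, feature_importance() defaults to split counts; a gain-based ranking can be requested instead. A sketch using the trained booster:

# Total gain contributed by each feature's splits instead of the default split counts.
gain_imp = pd.Series(
    bst.feature_importance(importance_type='gain'),
    index = bst.feature_name()
)
print(gain_imp.sort_values(ascending=False).head(10).round(1))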

CatBoost

In [21]:
import catboost as cb
from catboost import CatBoostClassifier
In [22]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')

cat_features = [col for col in data.columns if data[col].dtype in ("category", "object")]
# CatBoost does not accept NaN in categorical columns, so fill them with a sentinel value.
data[cat_features] = data[cat_features].fillna("_catboost_unknown")
In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1), 
    data['TARGET'], 
    test_size=0.25,
    random_state=SEED
)

dtrain = cb.Pool(X_train, label=y_train, cat_features=cat_features)
dtest = cb.Pool(X_test, label=y_test, cat_features=cat_features)
In [24]:
params = dict(
    loss_function = "Logloss",
    eval_metric = "AUC",
    allow_writing_files = False,
    auto_class_weights = "Balanced",
    use_best_model = True,
    depth = 6,
    iterations = 500,
    learning_rate = 0.1,
    subsample = 0.7,
    random_strength = 1.0,
    bagging_temperature = 1.0,
    sampling_frequency = "PerTree",
    l2_leaf_reg = 10,
    grow_policy = "SymmetricTree"
)
In [25]:
clf = CatBoostClassifier(**params)

start = time.time()
clf.fit(
    dtrain,
    eval_set = [dtrain, dtest],
    early_stopping_rounds = 20,
    use_best_model = True,
    verbose = 100
)
print(f'Took {time.time() - start:.0f} seconds.')
0:	test: 0.6919355	test1: 0.6911798	best: 0.6911798 (0)	total: 767ms	remaining: 6m 22s
100:	test: 0.7614232	test1: 0.7517401	best: 0.7517401 (100)	total: 50.3s	remaining: 3m 18s
200:	test: 0.7792934	test1: 0.7591927	best: 0.7591927 (200)	total: 1m 40s	remaining: 2m 29s
300:	test: 0.7949443	test1: 0.7608856	best: 0.7609248 (282)	total: 2m 35s	remaining: 1m 42s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7609248209
bestIteration = 282

Shrink model to first 283 iterations.
Took 157 seconds.
In [26]:
train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.6f}")
print(f"test's auc: {test_auc:.6f}")
train's auc: 0.792471
test's auc: 0.760925
In [27]:
evals_result = clf.get_evals_result()
# Relabel the training Pool's curve, then reuse LightGBM's plotting helper for the AUC curves.
evals_result['learn'] = evals_result.pop('validation_0')
lgb.plot_metric(evals_result, metric='AUC')
plt.show()
[Figure: learn and validation AUC during training]
In [28]:
clf.get_feature_importance(prettified=True).set_index("Feature Id").head(10)
Out[28]:
                        Importances
Feature Id
EXT_SOURCE_3              17.871654
EXT_SOURCE_2              10.726697
EXT_SOURCE_1               6.637747
AMT_CREDIT                 5.673753
AMT_GOODS_PRICE            5.296537
DAYS_BIRTH                 5.073781
AMT_ANNUITY                4.297499
DAYS_EMPLOYED              3.571174
DAYS_LAST_PHONE_CHANGE     2.310004
DAYS_ID_PUBLISH            2.261567
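The table above uses CatBoost's default PredictionValuesChange importance. A loss-based importance evaluated on the held-out Pool is also available; a sketch reusing dtest from above:

# Importance measured as the change in the loss function when a feature is excluded, computed on the validation Pool.
loss_imp = clf.get_feature_importance(data=dtest, type="LossFunctionChange", prettified=True)
print(loss_imp.head(10))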