Scikit-learn
In [1]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
# Basic configuration: suppress warnings and fix the random seed.
warnings.filterwarnings('ignore')
SEED = 42
In [2]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')
categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
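As a quick sanity check (a minimal sketch against the frame just loaded), every converted column should now report the pandas category dtype:

# Each categorical column should show dtype 'category' after the cast above.
print(data[categorical_features].dtypes.unique())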
In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1),
    data['TARGET'],
    test_size=0.25,
    random_state=SEED
)
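TARGET is heavily imbalanced, which is why each model below compensates for it (`class_weight="balanced"`, `scale_pos_weight`, `is_unbalance`, `auto_class_weights`). A quick check of the ratio, assuming the split above; on this dataset neg/pos comes out around 11, matching XGBoost's `scale_pos_weight = 11` later on:

# Negative-to-positive ratio of the training labels; this is the usual
# heuristic for choosing XGBoost's scale_pos_weight.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
print(f'neg/pos = {neg / pos:.1f}')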
In [4]:
gbdt = HistGradientBoostingClassifier(
    loss = 'log_loss',
    max_iter = 500,
    learning_rate = 0.1,
    max_features = 0.7,
    max_depth = 6,
    l2_regularization = 10,
    categorical_features = 'from_dtype',  # infer from pandas 'category' dtype (scikit-learn >= 1.4)
    scoring = "roc_auc",
    early_stopping = 'auto',
    n_iter_no_change = 20,
    class_weight = "balanced",
    random_state = SEED,
    verbose = 0
)
click = time.time()
gbdt.fit(X_train, y_train)
print(f'Took {time.time() - click:.0f} seconds.')
Took 15 seconds.
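Early stopping is active here ('auto' enables it for datasets of this size), so the fitted model may hold fewer than `max_iter` trees. A minimal check via the fitted `n_iter_` attribute:

# Number of boosting iterations actually kept after early stopping.
print(f'Stopped after {gbdt.n_iter_} of {gbdt.max_iter} iterations.')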
In [5]:
train_auc = roc_auc_score(y_train, gbdt.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, gbdt.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.6f}")
print(f"test's auc: {test_auc:.6f}")
train's auc: 0.810746
test's auc: 0.755827
In [6]:
plt.plot(gbdt.train_score_, label="train")
plt.plot(gbdt.validation_score_, label="validation")
plt.legend()
plt.title('Metric during training')
plt.xlabel('Iterations')
plt.ylabel("auc")
Out[6]:
Text(0, 0.5, 'auc')
XGBoost
In [7]:
import xgboost as xgb
In [8]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')
categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
In [9]:
X_train, X_test, y_train, y_test = train_test_split(
data.drop('TARGET', axis=1),
data['TARGET'],
test_size=0.25,
random_state=SEED
)
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
In [10]:
# Specify parameters as a dict.
params = dict(
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,  # roughly the neg/pos class ratio
    learning_rate = 0.1,
    max_depth = 6,
    subsample = 0.7,
    colsample_bytree = 0.5,
    reg_alpha = 10,
    reg_lambda = 10,
    seed = SEED
)
In [11]:
# Training takes the parameter dict and the DMatrix datasets:
evals_result = {}
callbacks = [
    xgb.callback.EvaluationMonitor(period=100, show_stdv=True),
    # NOTE: EarlyStopping watches the last dataset in `evals` (here: train),
    # so it will not fire when eval-auc stops improving.
    xgb.callback.EarlyStopping(20)
]
click = time.time()
bst = xgb.train(
    params, dtrain,
    num_boost_round = 500,
    evals = [(dtest, "eval"), (dtrain, "train")],
    evals_result = evals_result,
    callbacks = callbacks,
    verbose_eval = 0
)
print(f'Took {time.time() - click:.0f} seconds.')
[0]    eval-auc:0.62172    train-auc:0.64208
[100]  eval-auc:0.75551    train-auc:0.81036
[200]  eval-auc:0.75484    train-auc:0.84563
[300]  eval-auc:0.75352    train-auc:0.87289
[400]  eval-auc:0.75114    train-auc:0.89479
[499]  eval-auc:0.74911    train-auc:0.91158
Took 59 seconds.
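Note that eval-auc peaks near iteration 100 and decays afterwards, yet training still ran the full 500 rounds: as flagged above, xgboost's EarlyStopping callback monitors the last dataset in `evals` by default, which here is the training set. A hedged sketch that pins it to the held-out set instead:

# Watch the held-out split explicitly; maximize=True because AUC should rise,
# save_best=True keeps the booster at its best eval iteration.
early_stop = xgb.callback.EarlyStopping(
    rounds=20, data_name='eval', metric_name='auc', maximize=True, save_best=True
)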
In [12]:
def plot_metric(evals_result, metric):
    for label in evals_result:
        plt.plot(evals_result[label][metric], label=label)
    plt.legend()
    plt.title('Metric during training')
    plt.xlabel('Iterations')
    plt.ylabel(metric)

plot_metric(evals_result, metric='auc')
In [13]:
feature_imp = pd.Series(bst.get_fscore())
print(feature_imp.sort_values(ascending=False).head(10))
ORGANIZATION_TYPE    1545.0
EXT_SOURCE_3         1252.0
EXT_SOURCE_2         1176.0
OCCUPATION_TYPE      1146.0
DAYS_ID_PUBLISH      1053.0
DAYS_BIRTH            993.0
EXT_SOURCE_1          981.0
DAYS_REGISTRATION     974.0
AMT_ANNUITY           968.0
AMT_CREDIT            840.0
dtype: float64
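get_fscore() reports 'weight' importance, i.e. how often a feature is split on; gain-based importance usually ranks differently. A minimal alternative against the same booster:

# 'gain' averages the loss reduction of each feature's splits.
gain_imp = pd.Series(bst.get_score(importance_type='gain'))
print(gain_imp.sort_values(ascending=False).head(10))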
LightGBM
In [14]:
import lightgbm as lgb
In [15]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')
categorical_features = [col for col in data.columns if data[col].dtype in ("object", "category")]
data[categorical_features] = data[categorical_features].astype("category")
In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1),
    data['TARGET'],
    test_size=0.25,
    random_state=SEED
)
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test, reference=dtrain)
In [17]:
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,
    num_boost_round = 500,
    learning_rate = 0.1,
    max_depth = 6,
    feature_fraction = 0.5,
    bagging_fraction = 0.7,
    lambda_l1 = 15,
    lambda_l2 = 10,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = 0
)
In [18]:
eval_results = {} # to record eval results for plotting
callbacks = [
    lgb.log_evaluation(period=100),
    lgb.early_stopping(stopping_rounds=20),
    lgb.record_evaluation(eval_results)
]
# Training
bst = lgb.train(
    params,
    dtrain,
    valid_sets = [dtest, dtrain],
    valid_names = ["eval", "train"],
    callbacks = callbacks
)
[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth=6) without explicitly setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=64) in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.
Training until validation scores don't improve for 20 rounds
[100]  train's auc: 0.794883    eval's auc: 0.756161
Early stopping, best iteration is:
[165]  train's auc: 0.812678    eval's auc: 0.75743
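The warning comes from LightGBM's leaf-wise growth: max_depth=6 allows up to 2**6 = 64 leaves, but the default num_leaves=31 binds first. A hypothetical variant per the warning, keeping all other parameters as above:

# Cap complexity through num_leaves explicitly (<= 64 for depth-6 trees).
params_leafwise = dict(params, num_leaves=63)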
In [19]:
# Plotting metrics recorded during training
lgb.plot_metric(eval_results, metric='auc')
plt.show()
In [20]:
feature_imp = pd.Series(
    bst.feature_importance(),
    index = bst.feature_name()
)
print(feature_imp.sort_values(ascending=False).head(10))
ORGANIZATION_TYPE    685
OCCUPATION_TYPE      269
EXT_SOURCE_3         261
EXT_SOURCE_1         215
DAYS_BIRTH           191
EXT_SOURCE_2         182
AMT_GOODS_PRICE      180
AMT_ANNUITY          176
AMT_CREDIT           166
DAYS_ID_PUBLISH      150
dtype: int32
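feature_importance() defaults to importance_type='split' (raw split counts, hence the integer dtype); a gain-based view of the same booster is one argument away:

# 'gain' sums the loss reduction contributed by each feature's splits.
gain_imp = pd.Series(
    bst.feature_importance(importance_type='gain'),
    index = bst.feature_name()
)
print(gain_imp.sort_values(ascending=False).head(10))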
CatBoost
In [21]:
import catboost as cb
from catboost import CatBoostClassifier
In [22]:
path = '~/Documents/datasets/Home-Credit-Default-Risk.csv'
data = pd.read_csv(path, index_col='SK_ID_CURR')
cat_features = [col for col in data.columns if data[col].dtype in ("category", "object")]
# CatBoost does not accept NaN in categorical features, so fill with a sentinel value.
data[cat_features] = data[cat_features].fillna("_catboost_unknown")
In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1),
    data['TARGET'],
    test_size=0.25,
    random_state=SEED
)
dtrain = cb.Pool(X_train, label=y_train, cat_features=cat_features)
dtest = cb.Pool(X_test, label=y_test, cat_features=cat_features)
In [24]:
params = dict(
    loss_function = "Logloss",
    eval_metric = "AUC",
    allow_writing_files = False,
    auto_class_weights = "Balanced",
    use_best_model = True,
    depth = 6,
    iterations = 500,
    learning_rate = 0.1,
    subsample = 0.7,
    random_strength = 1.0,
    bagging_temperature = 1.0,
    sampling_frequency = "PerTree",
    l2_leaf_reg = 10,
    grow_policy = "SymmetricTree"
)
In [25]:
clf = CatBoostClassifier(**params)
click = time.time()
clf.fit(dtrain, eval_set=[dtrain, dtest], early_stopping_rounds=20, verbose=100)  # use_best_model already set in params
print(f'Took {time.time() - click:.0f} seconds.')
0:    test: 0.6919355    test1: 0.6911798    best: 0.6911798 (0)    total: 767ms    remaining: 6m 22s
100:  test: 0.7614232    test1: 0.7517401    best: 0.7517401 (100)  total: 50.3s    remaining: 3m 18s
200:  test: 0.7792934    test1: 0.7591927    best: 0.7591927 (200)  total: 1m 40s   remaining: 2m 29s
300:  test: 0.7949443    test1: 0.7608856    best: 0.7609248 (282)  total: 2m 35s   remaining: 1m 42s
Stopped by overfitting detector (20 iterations wait)
bestTest = 0.7609248209
bestIteration = 282
Shrink model to first 283 iterations.
Took 157 seconds.
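The overfitting detector shrank the model to its best iteration (282). Both the iteration and the corresponding scores are queryable directly from the fitted classifier:

# Best iteration/score are chosen on the last eval set (dtest here).
print(clf.get_best_iteration())
print(clf.get_best_score())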
In [26]:
train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.6f}")
print(f"test's auc: {test_auc:.6f}")
train's auc: 0.792471
test's auc: 0.760925
In [27]:
evals_result = clf.get_evals_result()
# CatBoost does not compute AUC on the learn set by default, so reuse the
# first eval set (dtrain) under the 'learn' key before plotting.
evals_result['learn'] = evals_result.pop('validation_0')
lgb.plot_metric(evals_result, metric='AUC')
plt.show()
In [28]:
clf.get_feature_importance(prettified=True).set_index("Feature Id").head(10)
Out[28]:
| Feature Id | Importances |
| --- | --- |
| EXT_SOURCE_3 | 17.871654 |
| EXT_SOURCE_2 | 10.726697 |
| EXT_SOURCE_1 | 6.637747 |
| AMT_CREDIT | 5.673753 |
| AMT_GOODS_PRICE | 5.296537 |
| DAYS_BIRTH | 5.073781 |
| AMT_ANNUITY | 4.297499 |
| DAYS_EMPLOYED | 3.571174 |
| DAYS_LAST_PHONE_CHANGE | 2.310004 |
| DAYS_ID_PUBLISH | 2.261567 |