import pandas as pd
import numpy as np
import copy
import json
import pickle
import joblib
import lightgbm as lgb
import optuna
import warnings
import gc

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns

# Setting configuration.
pd.set_option('display.float_format', lambda x: '%.5f' %x)
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
optuna.logging.set_verbosity(optuna.logging.WARNING)

SEED = 42

print('Loading data...')
path = '../datasets/Home-Credit-Default-Risk/selected_data.csv'
df = pd.read_csv(path, index_col='SK_ID_CURR')

Loading data...

# Split data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop(columns="TARGET"), 
    df["TARGET"], 
    test_size=0.25, 
    random_state=SEED
)

print("X_train shape:", X_train.shape)
print('train:', y_train.value_counts(), sep='\n') 
print('valid:', y_valid.value_counts(), sep='\n')

X_train shape: (230633, 835)
train:
TARGET
0    211999
1     18634
Name: count, dtype: int64
valid:
TARGET
0    70687
1     6191
Name: count, dtype: int64

# Specific feature names and categorical features
feature_name = X_train.columns.tolist()
categorical_feature = X_train.select_dtypes(object).columns.tolist()

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Encode categorical features
encoder = make_column_transformer(
    (OneHotEncoder(
        drop='if_binary', 
        min_frequency=0.02, 
        max_categories=20, 
        sparse_output=False,
        handle_unknown='ignore'
    ), categorical_feature),
    remainder='passthrough', 
    verbose_feature_names_out=False,
    verbose=True    
)

print('fitting...')
encoder.fit(X_train)

print('encoding...')
train_dummies = encoder.transform(X_train)
valid_dummies = encoder.transform(X_valid)
print('train data shape:', X_train.shape)

fitting...
[ColumnTransformer] . (1 of 2) Processing onehotencoder, total=   4.7s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
encoding...
train data shape: (230633, 835)

del df, X_train, X_valid
gc.collect()

2948

# Define a cross validation strategy
# We use the cross_val_score function of Sklearn. 
# However this function has not a shuffle attribute, we add then one line of code, 
# in order to shuffle the dataset prior to cross-validation

def evaluate(model, X, y, n_folds = 5, verbose=True):
    kf = KFold(n_folds, shuffle=True, random_state=SEED).get_n_splits(X)
    scores = cross_val_score(
        model, 
        X, 
        y, 
        scoring="roc_auc", 
        cv = kf
    )
    if verbose:
        print(f"valid auc: {scores.mean():.3f} +/- {scores.std():.3f}")
    return scores.mean()

class Objective:
    estimators = (
  	    LogisticRegression, 
  	    SGDClassifier, 
  	    GaussianNB, 
  	    RandomForestClassifier, 
  	    HistGradientBoostingClassifier
  	)
    def __init__(self, estimator, X, y):
  	    # assert isinstance(estimator, estimators), f"estimator must be one of {estimators}"
        self.model = estimator
        self.X = X
        self.y = y
    
    def __call__(self, trial):
        # Create hyperparameter space
        if isinstance(self.model, LogisticRegression): 
            search_space = dict(
                class_weight = 'balanced', 
                C = trial.suggest_float('C', 0.01, 100.0, log=True),
                l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
            )
        
        
        elif isinstance(self.model, SGDClassifier): 
            search_space = dict(
                class_weight = 'balanced', 
                loss = trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber']), 
                alpha = trial.suggest_float('alpha', 1e-5, 10.0, log=True),
                penalty = 'elasticnet',
                l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0),
                early_stopping = True
            )
        
        elif isinstance(self.model, GaussianNB): 
            search_space = dict(
            priors = None
            )
        
        elif isinstance(self.model, RandomForestClassifier): 
            search_space = dict(
                class_weight = 'balanced', 
                n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50),
                max_depth = trial.suggest_int('max_depth', 2, 20),
                max_features = trial.suggest_float('max_features', 0.2, 0.9),
                random_state = SEED
            )
        
        elif isinstance(self.model, HistGradientBoostingClassifier): 
            search_space = dict(
                class_weight = 'balanced', 
                learning_rate = trial.suggest_float('learning_rate', 1e-3, 10.0, log=True),
                max_iter = trial.suggest_int('max_iter', 50, 500, step=50),
                max_depth = trial.suggest_int('max_depth', 2, 20),
                max_features = trial.suggest_float('max_features', 0.2, 0.9),
                l2_regularization = trial.suggest_float('l2_regularization', 1e-3, 10.0, log=True),
                random_state = SEED,
                verbose = 0
            )
    
        # Setting hyperparameters
        self.model.set_params(**search_space) 
    
        # Training with 5-fold CV:
        score = evaluate(self.model, self.X, self.y)
        return score

def timer(func):
    import time
    import functools
    def strfdelta(tdelta, fmt):
        hours, remainder = divmod(tdelta, 3600)
        minutes, seconds = divmod(remainder, 60)
        return fmt.format(hours, minutes, seconds)
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        click = time.time()
        result = func(*args, **kwargs)
        delta = strfdelta(time.time() - click, "{:.0f} hours {:.0f} minutes {:.0f} seconds")
        print(f"{func.__name__} cost time {delta}")
        return result
    return wrapper

# Creating a pipeline & Hyperparameter tuning

@timer
def tuning(model, X, y):
    # create a study object
    study = optuna.create_study(direction="maximize")
    # Invoke optimization of the objective function.
    objective = Objective(model, X, y)
    study.optimize(
        objective, 
        n_trials = 50,
        timeout = 2400,
        gc_after_trial = True,
        show_progress_bar = True
    )
    print(model, 'best score:', study.best_value) 
    return study

Objective.estimators

(sklearn.linear_model._logistic.LogisticRegression,
 sklearn.linear_model._stochastic_gradient.SGDClassifier,
 sklearn.naive_bayes.GaussianNB,
 sklearn.ensemble._forest.RandomForestClassifier,
 sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier)

# opt_results = []
# for model in Objective.estimators:
#     study = tuning(model(), train_dummies, y_train)
#     opt_results.append(study)
#     print(model)
#     print(study.best_trial.params)

# define the search space and the objecive function
def stacking_obj(trial):
    stacking = StackingClassifier(
        # The `estimators` parameter corresponds to the list of the estimators which are stacked.
        estimators = [
            ('Logit', LogisticRegression(
                class_weight = 'balanced', 
                C = trial.suggest_float('Logit__C', 0.01, 100.0, log=True),
                l1_ratio = trial.suggest_float('Logit__l1_ratio', 0.0, 1.0)  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
            )),
            ('SGD', SGDClassifier(
                class_weight = 'balanced', 
                loss = trial.suggest_categorical('SGD__loss', ['hinge', 'log_loss', 'modified_huber']), 
                alpha = trial.suggest_float('SGD__alpha', 1e-5, 10.0, log=True),
                penalty = 'elasticnet',
                l1_ratio = trial.suggest_float('SGD__l1_ratio', 0.0, 1.0),
                early_stopping = True
            )),
            ('GaussianNB', GaussianNB())
        ],
        # The final_estimator will use the predictions of the estimators as input
        final_estimator = LogisticRegression(
            class_weight = 'balanced', 
            C = trial.suggest_float('final__C', 0.01, 100.0, log=True),
            # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
            l1_ratio = trial.suggest_float('final__l1_ratio', 0.0, 1.0)  
        ),
        verbose = 1
    )
    score = evaluate(stacking, train_dummies, y_train, n_folds = 3)
    return score

# create a study object.
study = optuna.create_study(
    study_name = 'stacking-study',  # Unique identifier of the study.
    direction = 'maximize'
)

# Invoke optimization of the objective function.
study.optimize(
    stacking_obj, 
    n_trials = 100, 
    timeout = 3600,
    gc_after_trial = True,
    show_progress_bar = True
)

  0%|          | 0/100 [00:00<?, ?it/s]

valid auc: 0.676 +/- 0.017
valid auc: 0.669 +/- 0.021
valid auc: 0.673 +/- 0.016
valid auc: 0.451 +/- 0.121
valid auc: 0.592 +/- 0.045
valid auc: 0.666 +/- 0.017
valid auc: 0.675 +/- 0.014
valid auc: 0.666 +/- 0.021
valid auc: 0.672 +/- 0.016
valid auc: 0.667 +/- 0.021
valid auc: 0.672 +/- 0.012

joblib.dump(study, path + "stacking-study.pkl")

study = joblib.load(path + "stacking-study.pkl")

print("Best trial until now:")
print(" Value: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

Best trial until now:
 Value:  0.6761396385434888
 Params: 
    Logit__C: 0.020329668727865235
    Logit__l1_ratio: 0.5165207006926232
    SGD__loss: modified_huber
    SGD__alpha: 1.6638099778831132
    SGD__l1_ratio: 0.7330208370976262
    final__C: 14.1468564043383
    final__l1_ratio: 0.4977751012657087

stacking = StackingClassifier(
    # The `estimators` parameter corresponds to the list of the estimators which are stacked.
    estimators = [
        ('Logit', LogisticRegression(
            class_weight = 'balanced', 
            C = 0.020329668727865235,
            l1_ratio = 0.5165207006926232  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
        )),
        ('SGD', SGDClassifier(
            class_weight = 'balanced', 
            loss = 'modified_huber', 
            alpha = 1.6638099778831132,
            penalty = 'elasticnet',
            l1_ratio = 0.7330208370976262,
            early_stopping = True
        )),
        ('GaussianNB', GaussianNB())
    ],
    # The final_estimator will use the predictions of the estimators as input
    final_estimator = LogisticRegression(
        class_weight = 'balanced', 
        C = 14.1468564043383,
        # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio = 0.4977751012657087 
    ),
    verbose = 1
)

score = evaluate(stacking, train_dummies, y_train)

valid auc: 0.674 +/- 0.009

stacking.fit(train_dummies, y_train)

train_auc = roc_auc_score(y_train, stacking.predict_proba(train_dummies)[:, 1])
valid_auc = roc_auc_score(y_valid, stacking.predict_proba(valid_dummies)[:, 1])
print('train auc:', train_auc)
print('valid auc:', valid_auc)

train auc: 0.6753919322392181
valid auc: 0.6752015627178207

Ensembles¶

创建数据集¶

创建优化器¶

超参数优化¶

模型训练¶