A saying circulates widely in the industry: data and features determine the upper bound of machine learning, and models and algorithms merely approach that bound. Feature engineering therefore occupies a very important place in machine learning; in practical applications it is often the key to success.
Feature engineering is the most time- and effort-consuming part of data analysis. Unlike algorithms and models, it is not a fixed sequence of steps but largely a matter of engineering experience and trade-offs, so there is no single unified method. This article only summarizes some commonly used techniques.
Feature engineering covers sub-problems such as Data Preprocessing, Feature Extraction, Feature Selection, and Feature Construction.
Import the necessary packages
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import FeatureUnion, make_union, Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
# Setting configuration.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
SEED = 42
Data Preprocessing
Data preprocessing is the essential first step of feature engineering: features have to be turned into a form that machine learning models can accept. The sklearn.preprocessing module solves most common preprocessing problems.
This chapter processes the data along two parallel tracks:
- plain function wrappers built on pandas
- a pipeline built on sklearn
First, define a timer decorator so we can measure performance later (a short usage sketch follows the definition).
def timer(func):
import time
import functools
def strfdelta(tdelta, fmt):
hours, remainder = divmod(tdelta, 3600)
minutes, seconds = divmod(remainder, 60)
return fmt.format(hours, minutes, seconds)
@functools.wraps(func)
def wrapper(*args, **kwargs):
click = time.time()
result = func(*args, **kwargs)
delta = strfdelta(time.time() - click, "{:.0f} hours {:.0f} minutes {:.0f} seconds")
print(f"{func.__name__} cost time {delta}")
return result
return wrapper
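A quick usage sketch of the decorator; load_data below is a hypothetical helper, named only to show how the timer is applied.
# Hypothetical example: decorate any function to report its wall-clock time.
@timer
def load_data(path):
    return pd.read_csv(path)

# Calling it prints something like "load_data cost time 0 hours 0 minutes 3 seconds".
# df = load_data('../datasets/Home-Credit-Default-Risk/application_train.csv')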
df = pd.read_csv('../datasets/Home-Credit-Default-Risk/application_train.csv')
if df['SK_ID_CURR'].nunique() < df.shape[0]:
df = df.drop_duplicates(subset=['SK_ID_CURR'], keep='last')
Data Type Conversion
Automatically convert string-typed numbers to numeric (in this dataset no object column is actually numeric, so the dtype counts below do not change).
df.dtypes.value_counts()
float64 65 int64 41 object 16 Name: count, dtype: int64
df.apply(pd.to_numeric, errors='ignore').dtypes.value_counts()
float64 65 int64 41 object 16 Name: count, dtype: int64
Sometimes a numeric feature only identifies different categories and its magnitude carries no real meaning, so we convert it into a categorical feature.
This project has no such feature; HOUR_APPR_PROCESS_START is shown purely as an example:
# df['HOUR_APPR_PROCESS_START'] = df['HOUR_APPR_PROCESS_START'].astype(str)
Cleaning Erroneous Data
Next, based on business common sense and tools such as box plots, we look for unreasonable feature values and clean them. During data exploration we noticed that the numbers in the DAYS_BIRTH column (age) are negative, because they are recorded relative to the current loan application, so we convert them to positive values and inspect the distribution.
(df['DAYS_BIRTH'] / -365).describe()
count 307511.000000 mean 43.936973 std 11.956133 min 20.517808 25% 34.008219 50% 43.150685 75% 53.923288 max 69.120548 Name: DAYS_BIRTH, dtype: float64
Those ages look reasonable and contain no outliers. Next, we run the same analysis on the other DAYS features.
for feature in ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']:
print(f'{feature} info: ')
print((df[feature] / -365).describe() )
DAYS_BIRTH info: count 307511.000000 mean 43.936973 std 11.956133 min 20.517808 25% 34.008219 50% 43.150685 75% 53.923288 max 69.120548 Name: DAYS_BIRTH, dtype: float64 DAYS_EMPLOYED info: count 307511.000000 mean -174.835742 std 387.056895 min -1000.665753 25% 0.791781 50% 3.323288 75% 7.561644 max 49.073973 Name: DAYS_EMPLOYED, dtype: float64 DAYS_REGISTRATION info: count 307511.000000 mean 13.660604 std 9.651743 min -0.000000 25% 5.506849 50% 12.339726 75% 20.491781 max 67.594521 Name: DAYS_REGISTRATION, dtype: float64 DAYS_ID_PUBLISH info: count 307511.000000 mean 8.203294 std 4.135481 min -0.000000 25% 4.712329 50% 8.915068 75% 11.778082 max 19.717808 Name: DAYS_ID_PUBLISH, dtype: float64
pd.cut(df['DAYS_EMPLOYED'] / -365, bins=5).value_counts().sort_index()
DAYS_EMPLOYED (-1001.715, -790.718] 55374 (-790.718, -580.77] 0 (-580.77, -370.822] 0 (-370.822, -160.874] 0 (-160.874, 49.074] 252137 Name: count, dtype: int64
More than 50,000 clients have a DAYS_EMPLOYED of roughly 1,000 years; we can guess this value is simply a missing-value marker.
# Replace the anomalous values with nan
df_emp = df['DAYS_EMPLOYED'].where(df['DAYS_EMPLOYED'].abs()<365243, np.nan)
df_emp.plot.hist(title = 'Days Employment Histogram')
plt.xlabel('Days Employment')
Text(0.5, 0, 'Days Employment')
The distribution now looks essentially normal.
Similarly, we convert the missing-value markers in the other features to NaN so they can be handled uniformly later.
df.map(lambda x:x=='XNA').sum().sum()/df.size
0.0014761034004861135
Cleaning Boolean Features
for col in df.select_dtypes(exclude="number").columns:
if df[col].nunique() == 2:
print(df[col].value_counts())
NAME_CONTRACT_TYPE Cash loans 278232 Revolving loans 29279 Name: count, dtype: int64 FLAG_OWN_CAR N 202924 Y 104587 Name: count, dtype: int64 FLAG_OWN_REALTY Y 213312 N 94199 Name: count, dtype: int64 EMERGENCYSTATE_MODE No 159428 Yes 2328 Name: count, dtype: int64
df[['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']].replace({'Y': 1, 'N': 0}).value_counts()
FLAG_OWN_CAR FLAG_OWN_REALTY 0 1 140952 1 1 72360 0 0 61972 1 0 32227 Name: count, dtype: int64
df['EMERGENCYSTATE_MODE'].replace({'Yes': 1, 'No': 0}).value_counts()
EMERGENCYSTATE_MODE 0.0 159428 1.0 2328 Name: count, dtype: int64
Function Wrapper
Finally, wrap the steps above in a single function:
id_col = "SK_ID_CURR"
target = "TARGET"
# Data cleaning
def clean(df):
# remove duplicates and keep last occurrences
if df[id_col].nunique() < df.shape[0]:
df = df.drop_duplicates(subset=[id_col], keep='last')
# convert data to specified dtypes
df = df.apply(pd.to_numeric, errors='ignore')
# transform
for col in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']:
df[col] = df[col].replace({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0})
# Replace the anomalous values with nan
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].where(df['DAYS_EMPLOYED'].abs()<365243, np.nan)
df = df.replace('XNA', np.nan)
X = df.drop([id_col, target], axis=1)
y = df[target]
return X, y
X, y = clean(df)
Handling Missing Values
Missing values are extremely common, and most machine learning models require them to be handled before fitting (a short sketch of the sklearn imputers follows the table below).
Method | Function | Python package |
---|---|---|
Statistic-based imputation | SimpleImputer | sklearn.impute |
Statistic/arbitrary imputation | df.fillna() | pandas |
Multivariate (iterative) imputation | IterativeImputer | sklearn.impute |
Nearest-neighbor imputation | KNNImputer | sklearn.impute |
Dropping missing values | df.dropna() | pandas |
Missing-value indicators | MissingIndicator | sklearn.impute |
Missing-value indicators | df.isna(), df.isnull() | pandas |
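As a quick reference for the sklearn imputers listed above, a minimal sketch on a toy frame (the values here are made up):
from sklearn.impute import SimpleImputer, KNNImputer

toy = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})
# Fill each column with its median
SimpleImputer(strategy="median").fit_transform(toy)
# Fill from the nearest neighbour, measured on the non-missing features
KNNImputer(n_neighbors=1).fit_transform(toy)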
Missing Value Statistics
# Function to calculate missing values by column
def display_missing(df, threshold=None, verbose=True):
    missing_df = pd.DataFrame({
        "missing_number": df.isna().sum(),   # Total missing values
        "missing_rate": df.isna().mean()     # Proportion of missing values
    }, index=df.columns)
    missing_df = missing_df.query("missing_rate > 0").sort_values("missing_rate", ascending=False)
    return_all = threshold is None           # remember whether a threshold was passed
    threshold = 0.25 if threshold is None else threshold
    high_missing = missing_df.query(f"missing_rate > {threshold}")
    # Print some summary information
    if verbose:
        print(f"Your selected dataframe has {missing_df.shape[0]} out of {df.shape[1]} columns that have missing values.")
        print(f"There are {high_missing.shape[0]} columns with more than {threshold:.1%} missing values.")
        print("Columns with high missing rate:", high_missing.index.tolist())
    # Return all columns with missing values, or only those above the threshold
    return missing_df if return_all else high_missing
# Missing values statistics
print(display_missing(df).head(10))
Your selected dataframe has 67 out of 122 columns that have missing values. There are 50 columns with more than 25.0% missing values. Columns with high missing rate: ['COMMONAREA_MEDI', 'COMMONAREA_AVG', 'COMMONAREA_MODE', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'FONDKAPREMONT_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAPARTMENTS_AVG', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI', 'FLOORSMIN_AVG', 'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI', 'YEARS_BUILD_AVG', 'OWN_CAR_AGE', 'LANDAREA_AVG', 'LANDAREA_MEDI', 'LANDAREA_MODE', 'BASEMENTAREA_MEDI', 'BASEMENTAREA_AVG', 'BASEMENTAREA_MODE', 'EXT_SOURCE_1', 'NONLIVINGAREA_MEDI', 'NONLIVINGAREA_MODE', 'NONLIVINGAREA_AVG', 'ELEVATORS_MEDI', 'ELEVATORS_MODE', 'ELEVATORS_AVG', 'WALLSMATERIAL_MODE', 'APARTMENTS_MODE', 'APARTMENTS_MEDI', 'APARTMENTS_AVG', 'ENTRANCES_MODE', 'ENTRANCES_AVG', 'ENTRANCES_MEDI', 'LIVINGAREA_MEDI', 'LIVINGAREA_MODE', 'LIVINGAREA_AVG', 'HOUSETYPE_MODE', 'FLOORSMAX_MEDI', 'FLOORSMAX_AVG', 'FLOORSMAX_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_MODE', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE', 'OCCUPATION_TYPE'] missing_number missing_rate COMMONAREA_MEDI 214865 0.698723 COMMONAREA_AVG 214865 0.698723 COMMONAREA_MODE 214865 0.698723 NONLIVINGAPARTMENTS_MEDI 213514 0.694330 NONLIVINGAPARTMENTS_MODE 213514 0.694330 NONLIVINGAPARTMENTS_AVG 213514 0.694330 FONDKAPREMONT_MODE 210295 0.683862 LIVINGAPARTMENTS_MODE 210199 0.683550 LIVINGAPARTMENTS_MEDI 210199 0.683550 LIVINGAPARTMENTS_AVG 210199 0.683550
Visualizing the Missing Rate
high_missing = display_missing(X, verbose=False).head(10)
plt.figure(figsize = (8, 4))
sns.barplot(x="missing_rate", y=high_missing.index, data=high_missing)
plt.xlabel('Percent of missing values')
plt.ylabel('Features')
plt.title('Percent missing data by feature')
Text(0.5, 1.0, 'Percent missing data by feature')
Dropping Missing Values
If a feature's missing rate exceeds a threshold (e.g. 80%), its contribution to the model is limited, and it can usually be dropped.
threshold = int(df.shape[0]*0.2)
X.dropna(axis=1, thresh=threshold).shape
(307511, 120)
# Remove variables with high missing rate
def drop_missing_data(X, threshold=0.8):
    X = X.copy()
    # Keep columns with at least `threshold` non-NA values,
    # i.e. drop variables whose missing rate exceeds 1 - threshold (20% by default)
    thresh = int(X.shape[0] * threshold)
    X_new = X.dropna(axis=1, thresh=thresh)
    print(f"Removed {X.shape[1]-X_new.shape[1]} variables with missing more than {1 - threshold:.1%}")
    return X_new
drop_missing_data(X, threshold=0.2).shape
Removed 0 variables with missing more than 80.0%
(307511, 120)
Flagging Missing Values
Sometimes, for every column that contains missing values, we add an extra column marking where the values are missing; in some applications this works quite well. Continuing with the DAYS_EMPLOYED anomaly cleaned earlier, we flag the missing entries and check whether they affect default.
y.groupby(X['DAYS_EMPLOYED'].isna()).mean()
DAYS_EMPLOYED False 0.086600 True 0.053996 Name: TARGET, dtype: float64
The default rate of the missing group (5.4%) is lower than that of the non-missing group (8.66%); the missingness is clearly related to the target, so we add a DAYS_EMPLOYED_MISSING indicator column. This kind of flag mainly helps linear models, whereas tree-based models can pick up the pattern on their own.
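The simplest form of this idea, for a single column, is the sketch below; the helper that follows generalizes it and uses a chi-squared test to decide which indicators to keep.
# Minimal sketch: add one binary flag for one column
X_flagged = X.copy()
X_flagged['DAYS_EMPLOYED_MISSING'] = X_flagged['DAYS_EMPLOYED'].isna().astype(int)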
# Adds a binary variable to flag missing observations.
from sklearn.feature_selection import chi2
def flag_missing(X, alpha=0.05):
    """
    Adds a binary variable to flag missing observations (one indicator per variable).
    The added variables (missing indicators) are named after the original variable plus '_missing'.
    Parameters
    ----------
    alpha: float, default=0.05
        Indicators whose chi-squared p-value against the target exceeds alpha are added.
    """
    X = X.copy()
    na_flags = X.isna()
    # Compute chi-squared stats between each missing indicator and y.
    chi2_stats, p_values = chi2(na_flags, y)
    # Find the variables for which an indicator should be added.
    missing_indicator = na_flags.loc[:, p_values > alpha].add_suffix("_missing")
    X[missing_indicator.columns] = missing_indicator
    print(f"Added {missing_indicator.shape[1]} missing indicators")
    return X
flag_missing(X).shape
Added 6 missing indicators
(307511, 126)
pd.get_dummies(X["CODE_GENDER"], dummy_na=True).head()
F | M | NaN | |
---|---|---|---|
0 | False | True | False |
1 | True | False | False |
2 | False | True | False |
3 | True | False | False |
4 | False | True | False |
If a variable is boolean, its missing values can, depending on the situation, simply be filled with zero.
len([col for col in X if set(X[col].unique()) == {0, 1}])
34
If we read the field descriptions carefully, many missing values turn out to be explainable. For example, a missing NAME_TYPE_SUITE means nobody accompanied the client when applying for the loan, so it can be filled with 'unknown'. The counts of 30/60-day past-due observations in the client's social circle and the numbers of credit-bureau enquiries in the hour/day/week/month/quarter/year before the application can all be filled with 0.
def impute_manually(X):
"""
Replaces missing values by an arbitrary value
"""
X = X.copy()
# boolean
boolean_features = [col for col in X if set(X[col].unique()) == {0, 1}]
X[boolean_features] = X[boolean_features].fillna(0)
    # fill with a placeholder category
    features_fill_none = ["NAME_TYPE_SUITE", "OCCUPATION_TYPE"]
    X[features_fill_none] = X[features_fill_none].fillna('unknown')
# fill 0
features_fill_zero = [
"OBS_30_CNT_SOCIAL_CIRCLE",
"DEF_30_CNT_SOCIAL_CIRCLE",
"OBS_60_CNT_SOCIAL_CIRCLE",
"DEF_60_CNT_SOCIAL_CIRCLE",
"AMT_REQ_CREDIT_BUREAU_HOUR",
"AMT_REQ_CREDIT_BUREAU_DAY",
"AMT_REQ_CREDIT_BUREAU_WEEK",
"AMT_REQ_CREDIT_BUREAU_MON",
"AMT_REQ_CREDIT_BUREAU_QRT",
"AMT_REQ_CREDIT_BUREAU_YEAR"
]
X[features_fill_zero] = X[features_fill_zero].fillna(0)
return X
impute_manually(X).isna().sum().gt(0).sum()
58
Conditional Mean Imputation
From the earlier correlation analysis we know that AMT_ANNUITY is fairly strongly related to AMT_CREDIT and AMT_INCOME_TOTAL, so here we impute it with the median within groups formed by those two features, an approach known as conditional mean completion.
X[['AMT_CREDIT', 'AMT_INCOME_TOTAL']].corrwith(X["AMT_ANNUITY"])
AMT_CREDIT 0.770138 AMT_INCOME_TOTAL 0.191657 dtype: float64
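Before the generic helper below, a minimal sketch of the idea for this single column, assuming we bin the two related features into deciles and fill AMT_ANNUITY with the median of each group:
# Minimal sketch of conditional (group-wise) median imputation
groups = [pd.qcut(X['AMT_CREDIT'], 10, duplicates='drop'),
          pd.qcut(X['AMT_INCOME_TOTAL'], 10, duplicates='drop')]
annuity_filled = X.groupby(groups)['AMT_ANNUITY'].transform(lambda s: s.fillna(s.median()))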
# conditional statistic completer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import r_regression, f_regression
def fillna_by_groups(X, threshold=(0.0, 0.8), groupby=None, k=2, min_categories=2, bins=10, verbose=False):
    """
    Replaces missing values with group-wise statistics.
    Parameters
    ----------
    threshold: tuple of floats, default=(0.0, 0.8)
        Only columns whose non-NA rate lies within this interval are imputed.
    groupby: list, default=None
        Candidate variables to group by. If None, all columns are candidates.
    k: int, default=2
        Number of top related features to group by.
    min_categories: int, default=2
        Lower limit on the number of categories a feature needs in order to be used for grouping.
    bins: int, default=10
        Number of quantile bins used to discretize the numeric grouping features.
    """
    print("Conditional Mean Completer:")
    lower, upper = threshold
    if not 0 <= lower < upper <= 1:
        raise ValueError("threshold must satisfy 0 <= lower < upper <= 1.")
    X = pd.DataFrame(X.copy())
    X_bin = X.copy()
    na_size = X.isna().sum().sum()
    features_num = X.select_dtypes(include='number').columns.tolist()
    features_cat = X.select_dtypes(exclude='number').columns.tolist()
    X_bin[features_num] = X_bin[features_num].apply(pd.qcut, q=bins, duplicates="drop")
    X_bin[features_cat] = X_bin[features_cat].astype('category')
    X_bin = X_bin.transform(lambda x: x.cat.codes)
    X_bin = X_bin.transform(lambda x: x - x.min())  # chi-squared requires non-negative features
    if groupby is None:
        features_groupby = X_bin.columns.tolist()
    else:
        features_groupby = list(groupby)
    features_groupby = [colname for colname in features_groupby
                        if X[colname].nunique() >= min_categories]
    # Select the columns whose non-NA rate lies within the threshold interval.
variables = X.columns[X.notna().mean().between(lower, upper)].tolist()
for colname in variables:
other_features = list(set(features_groupby) - {colname})
if colname in features_num:
score_func = f_regression
elif colname in features_cat:
score_func = chi2
Xy = pd.concat([X_bin[other_features], X[colname]], axis=1).dropna(axis=0,how='any')
scores, _ = score_func(Xy[other_features], Xy[colname])
scores = pd.Series(scores, index=other_features).sort_values(ascending=False)
vars_top_k = scores[:k].index.tolist()
groups = [X_bin[col] for col in vars_top_k]
if colname in features_num:
# Replaces missing values by the mean or median
X[colname] = X.groupby(groups)[colname].transform(lambda x:x.fillna(x.median()))
if verbose:
print(f"Filling the missing values in {colname} with the medians of {vars_top_k} groups.")
elif colname in features_cat:
# Replaces missing values by the most frequent category
X[colname] = X[colname].groupby(groups).transform(lambda x:x.fillna(x.mode(dropna=False)[0]))
if verbose:
print(f"Filling the missing values in {colname} with the modes of {vars_top_k} groups.")
fillna_size = na_size - X.isna().sum().sum()
print(f"Filled {fillna_size} missing values ({fillna_size/na_size:.1%}).")
print(f"Transformed {len(variables)} variables with missing (threshold = [{lower:.1%}, {upper:.1%}]).")
print(f"And then, there are {X.isna().sum().gt(0).sum()} variables with missing.")
return X
fillna_by_groups(X).isna().sum().sum()
Conditional Mean Completer: Filled 759918 missing values (8.2%). Transformed 50 variables with missing (threshold = [0.0%, 80.0%]). And then, there are 67 variables with missing.
8503299
Simple Imputation
Numeric features with a low missing rate (under 10%) can be imputed with the median or mean, and categorical features with a low missing rate (under 10%) can be imputed with the mode.
# Simple imputer
def impute_simply(X, threshold=0.8):
"""
Univariate imputer for completing missing values with simple strategies.
"""
print("Simple imputer:")
X = X.copy()
variables = X.columns[X.isna().mean().between(0, 1-threshold, "right")].tolist()
features_num = X[variables].select_dtypes('number').columns.to_list()
features_cat = X[variables].select_dtypes(exclude='number').columns.to_list()
# Replaces missing values by the median or mode
medians = X[features_num].median().to_dict()
modes = X[features_cat].apply(lambda x: x.mode()[0]).to_dict()
impute_dict = {**medians, **modes}
X[variables] = X[variables].fillna(impute_dict)
print(f"Transformed {len(variables)} variables with missing (threshold={threshold:.1%}).")
print(f"And then, there are {X.isna().sum().gt(0).sum()} variables with missing.")
return X
impute_simply(X).isna().sum().gt(0).sum()
Simple imputer: Transformed 20 variables with missing (threshold=80.0%). And then, there are 50 variables with missing.
50
Pipeline Implementation
Finally, a summary of our missing-value strategy:
- drop features with more than 80% missing values
- add missing-value indicators
- impute manually where the missingness has a business meaning
- use multivariate or conditional-mean imputation for features with 10–80% missing values
- use simple statistical imputation for features with less than 10% missing values
Implementation with pandas:
print(X.pipe(drop_missing_data, threshold=0.2)
.pipe(flag_missing)
.pipe(impute_manually)
.pipe(fillna_by_groups, threshold=(0.2, 0.8))
.pipe(impute_simply, threshold=0.2)
.isna().sum().gt(0).sum())
Removed 0 variables with missing more than 80.0% Added 6 missing indicators Conditional Mean Completer: Filled 726563 missing values (8.2%). Transformed 49 variables with missing (threshold = [20.0%, 80.0%]). And then, there are 61 variables with missing. Simple imputer: Transformed 61 variables with missing (threshold=20.0%). And then, there are 0 variables with missing. 0
Implementation with sklearn
First, define a few custom transformers.
class DropMissingData(BaseEstimator, TransformerMixin):
"""
Remove features from data.
Parameters
----------
    threshold: float, default=0.8
        Minimum fraction of non-NA values a column must have in order to be kept.
"""
def __init__(self, threshold=0.8):
if 0 < threshold <= 1:
self.threshold = threshold
else:
raise ValueError("threshold must be a value between 0 < x <= 1. ")
def fit(self, X, y=None):
"""
        Find the variables whose missing rate exceeds the threshold so that they
        can be dropped.
Parameters
----------
X: pandas dataframe of shape = [n_samples, n_features]
The training data set.
y: pandas Series, default=None
y is not needed. You can pass None or y.
"""
# check input dataframe
# X, y = check_X_y(X, y)
# Get the names and number of features in the train set (the dataframe used during fit).
self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]
# Find the features to drop
self.variables = X.columns[X.isna().mean().gt(1-self.threshold)].tolist()
return self
def transform(self, X, y=None):
"""
Remove variables with missing more than threshold.
Parameters
----------
X: pandas dataframe of shape = [n_samples, n_features]
The dataframe to be transformed.
Returns
-------
X_new: pandas dataframe
The complete case dataframe for the selected variables.
"""
# Remove variables with missing more than threshold.
print(f"Removed {len(self.variables)} variables with missing more than {1-self.threshold:.1%}")
return X.drop(self.variables, axis=1)
def get_feature_names_out(self, input_features=None):
"""
Get output feature names for transformation. In other words, returns the
variable names of transformed dataframe.
Parameters
----------
input_features : array or list, default=None
            This parameter exists only for compatibility with the Scikit-learn pipeline.
- If `None`, then `feature_names_in_` is used as feature names in.
- If an array or list, then `input_features` must match `feature_names_in_`.
Returns
-------
feature_names_out: list
Transformed feature names.
"""
check_is_fitted(self)
if input_features is None:
feature_names_in = self.feature_names_in_
elif len(input_features) == self.n_features_in_:
# If the input was an array, we let the user enter the variable names.
feature_names_in = list(input_features)
else:
raise ValueError(
"The number of input_features does not match the number of "
"features seen in the dataframe used in fit."
)
# Remove features.
feature_names_out = [var for var in feature_names_in if var not in self.variables]
return feature_names_out
DropMissingData(threshold=0.4).fit(X).variables
['OWN_CAR_AGE', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'FLOORSMIN_AVG', 'LIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'FLOORSMIN_MODE', 'LIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI', 'LIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'FONDKAPREMONT_MODE']
Putting it all together in a pipeline
# Transformers for missing value imputation
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.impute import MissingIndicator
constant_imputer = FunctionTransformer(
func=impute_manually,
accept_sparse=True,
feature_names_out="one-to-one")
conditional_statistic_imputer = FunctionTransformer(
func=lambda X:fillna_by_groups(X, threshold=(0.2, 0.8)),
accept_sparse=True,
feature_names_out="one-to-one")
simple_imputer = make_column_transformer(
(SimpleImputer(strategy="median"), make_column_selector(dtype_include=np.number)),
(SimpleImputer(strategy="most_frequent"), make_column_selector(dtype_include=object)),
remainder="passthrough",
verbose_feature_names_out=False,
verbose=True)
missing_imputation = make_pipeline(
DropMissingData(threshold=0.2),
constant_imputer,
conditional_statistic_imputer,
simple_imputer,
verbose=True
)
# find variables for which indicator should be added.
add_missing_indicator = make_pipeline(
MissingIndicator(features='all', sparse=False),
SelectKBest(k=10),
verbose=True
)
features = X.columns.tolist()
handle_missing = make_column_transformer(
(missing_imputation, features),
(add_missing_indicator, features),
remainder="passthrough",
verbose_feature_names_out=False,
verbose=True
)
Because FeatureUnion prefixes the transformer name onto the feature names returned by get_feature_names_out, we use ColumnTransformer (with verbose_feature_names_out=False) instead, which achieves the same effect without the prefixes.
X_imputed = handle_missing.fit_transform(X, y)
X_imputed = pd.DataFrame(
X_imputed,
columns=handle_missing.get_feature_names_out()
)
X_imputed = X_imputed.astype('category')
X_imputed = X_imputed.apply(pd.to_numeric, errors='ignore')
Removed 0 variables with missing more than 80.0% [Pipeline] ... (step 1 of 4) Processing dropmissingdata, total= 0.2s [Pipeline] (step 2 of 4) Processing functiontransformer-1, total= 0.6s Conditional Mean Completer: Filled 726563 missing values (8.2%). Transformed 49 variables with missing (threshold = [20.0%, 80.0%]). And then, there are 55 variables with missing. [Pipeline] (step 3 of 4) Processing functiontransformer-2, total= 10.8s [ColumnTransformer] (1 of 2) Processing simpleimputer-1, total= 2.3s [ColumnTransformer] (2 of 2) Processing simpleimputer-2, total= 0.5s [Pipeline] . (step 4 of 4) Processing columntransformer, total= 3.2s [ColumnTransformer] .... (1 of 2) Processing pipeline-1, total= 14.9s [Pipeline] .. (step 1 of 2) Processing missingindicator, total= 1.9s [Pipeline] ....... (step 2 of 2) Processing selectkbest, total= 0.1s [ColumnTransformer] .... (2 of 2) Processing pipeline-2, total= 2.0s
Confirm that all missing values have been handled:
X_imputed.isna().sum().sum()
0
Encoding Categorical Features
Many machine learning algorithms, such as linear and logistic regression, accept only numeric inputs and cannot handle categorical features directly, so we need to re-encode categorical features as numeric variables.
Method | Function | Python package |
---|---|---|
Ordinal encoding | OrdinalEncoder | sklearn.preprocessing |
Ordinal encoding | CategoricalDtype | pandas.api.types |
Dummy (one-hot) encoding | OneHotEncoder | sklearn.preprocessing |
Dummy (one-hot) encoding | pd.get_dummies | pandas |
Mean (target) encoding | MeanEncoder | feature_engine.encoding |
Different kinds of categorical features call for different encodings.
X.dtypes.value_counts()
float64 67 int64 40 object 13 Name: count, dtype: int64
features = X.columns.tolist()
numeric_cols = X.select_dtypes("number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()
len(features) == len(numeric_cols + categorical_cols)
True
int_cols = X.select_dtypes("int64").columns.tolist()
float_cols = X.select_dtypes("float64").columns.tolist()
# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"
ordered_levels = {
"NAME_EDUCATION_TYPE": ["Lower secondary",
"Secondary / secondary special",
"Incomplete higher",
"Higher education"]
}
from pandas.api.types import CategoricalDtype
def ordinal_encode(X, levels: dict = None):
X = X.copy()
if levels is None:
variables = X.select_dtypes(exclude="number").columns
X[variables] = X[variables].astype("category")
else:
variables = list(levels)
dtypes = {name: CategoricalDtype(levels[name], ordered=True) for name in levels}
X[variables] = X[variables].astype(dtypes)
# The `cat.codes` attribute holds the category levels.
# For missing values, -1 is the default code.
X[variables] = X[variables].transform(lambda x: x.cat.codes)
print(f'{len(variables):d} columns were ordinal encoded')
return X
ordinal_encode(X, ordered_levels)[list(ordered_levels)].value_counts()
1 columns were ordinal encoded
NAME_EDUCATION_TYPE 1 218391 3 74863 2 10277 0 3816 -1 164 Name: count, dtype: int64
Dummy Encoding
Unordered categorical features can be used directly by tree ensembles (like XGBoost), but for linear models (like Lasso or Ridge) they must be one-hot encoded.
Let's look at the number of categories of each categorical feature:
# The nominal (unordered) categorical features
nominal_categories = [col for col in categorical_cols if col not in ordered_levels.keys()]
X[nominal_categories].nunique()
NAME_CONTRACT_TYPE 2 CODE_GENDER 2 NAME_TYPE_SUITE 7 NAME_INCOME_TYPE 8 NAME_FAMILY_STATUS 6 NAME_HOUSING_TYPE 6 OCCUPATION_TYPE 18 WEEKDAY_APPR_PROCESS_START 7 ORGANIZATION_TYPE 57 FONDKAPREMONT_MODE 4 HOUSETYPE_MODE 3 WALLSMATERIAL_MODE 7 dtype: int64
# Using pandas to encode categorical features
from pandas.api.types import CategoricalDtype
def onehot_encode(X, variables=None, dummy_na=True):
"""
Replace the categorical variables by the binary variables.
Parameters
----------
X: pd.DataFrame of shape = [n_samples, n_features]
The data to encode.
        Can be the entire dataframe, not just the selected variables.
variables: list, default=None
The list of categorical variables that will be encoded.
If None, the encoder will find and encode all variables of type object or categorical by default.
dummy_na: boolean, default=True
Returns
-------
X_new: pd.DataFrame.
        The encoded dataframe. Its shape differs from the original because the
        dummy variables replace the original categorical ones.
"""
    # pd.get_dummies automatically converts categorical columns into dummy variables
    if variables is None:
        variables = X.select_dtypes(exclude='number').columns.tolist()
        X = pd.get_dummies(X, dummy_na=dummy_na)   # the original columns are replaced in place
    else:
        X_dummy = pd.get_dummies(X[variables].astype(str), dummy_na=dummy_na)
        X = pd.concat([X, X_dummy], axis=1)
        # drop the original non-encoded variables.
        X = X.drop(variables, axis=1)
    print(f'{len(variables):d} columns were one-hot encoded')
    print(f'Dataset shape: {X.shape}')
    return X
print(X.pipe(onehot_encode, variables=nominal_categories)
.pipe(ordinal_encode, levels=ordered_levels)
.dtypes.value_counts())
12 columns were one-hot encoded Dataset shape: (307511, 254) 1 columns were ordinal encoded bool 146 float64 67 int64 40 int8 1 Name: count, dtype: int64
Mean Encoding
Usually, sklearn's OneHotEncoder or OrdinalEncoder is all we need for categorical features, and this simple preprocessing satisfies most data mining algorithms. But when a categorical feature has a very large number of possible values (high cardinality), one-hot encoding leads to a dimensionality explosion. Mean (target) encoding is an efficient alternative that can markedly improve model performance in practice.
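The core idea can be sketched in two lines of pandas (in practice the mapping should be learned on training data only, to avoid target leakage):
# Minimal sketch of mean (target) encoding for one high-cardinality column
category_means = y.groupby(X['OCCUPATION_TYPE']).mean()    # default rate per category
occupation_encoded = X['OCCUPATION_TYPE'].map(category_means)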
We can use the open-source feature-engine package, which wraps many common feature engineering methods, to do the mean encoding.
The variables OCCUPATION_TYPE (occupation) and ORGANIZATION_TYPE have relatively many categories, so we mean-encode them.
# replace categories by the mean value of the target for each category.
from feature_engine.encoding import MeanEncoder
mean_encoder = MeanEncoder(
missing_values='ignore',
ignore_format=True,
unseen='ignore'
)
mean_encoder.fit_transform(X[['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']], y).head()
OCCUPATION_TYPE | ORGANIZATION_TYPE | |
---|---|---|
0 | 0.105788 | 0.092996 |
1 | 0.063040 | 0.059148 |
2 | 0.105788 | 0.069781 |
3 | 0.105788 | 0.092996 |
4 | 0.063040 | 0.058824 |
Binning Continuous Features
In practice we often discretize continuous features. Binning removes the effect of the feature's scale, greatly reduces the influence of outliers, and makes the feature more stable.
Method | Function | Python package |
---|---|---|
Binarization | Binarizer | sklearn.preprocessing |
Binning | KBinsDiscretizer | sklearn.preprocessing |
Equal-frequency binning | pd.qcut | pandas |
Equal-width binning | pd.cut | pandas |
Binning mainly comes in three flavors: equal-frequency, equal-width, and cluster-based. Equal-frequency binning is somewhat affected by outliers, while equal-width binning can ignore outlier information entirely and thus lose information. To better respect the original distribution of the variable, consider cluster-based binning: first cluster the continuous variable (usually with k-means) and then use each sample's cluster label as its bin.
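A rough sketch of the cluster-based option, assuming k-means with 5 clusters on a single column (KBinsDiscretizer with strategy='kmeans', shown later, wraps the same idea):
from sklearn.cluster import KMeans

# Minimal sketch: k-means binning of one numeric column
credit = X[['AMT_CREDIT']].dropna()   # drop missing values, just in case
cluster_bins = KMeans(n_clusters=5, n_init=10, random_state=SEED).fit_predict(credit)
pd.Series(cluster_bins).value_counts()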
Take the effect of age on repayment as an example.
# Find the correlation of the positive days since birth and target
X['DAYS_BIRTH'].abs().corr(y)
-0.07823930830982709
The client's age is negatively correlated with the target: as clients get older, they tend to repay their loans on time more often. Next we draw a kernel density estimate (KDE) plot to see the effect of age on the target directly.
plt.figure(figsize = (5, 3))
sns.kdeplot(x=X['DAYS_BIRTH'] / -365, hue=y, common_norm=False)
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
Text(0.5, 1.0, 'Distribution of Ages')
Now let's bin the ages:
# Bin the age data
age_binned = pd.cut(X['DAYS_BIRTH']/-365, bins = np.linspace(20, 70, num = 11))
age_groups = y.groupby(age_binned).mean()
plt.figure(figsize = (8, 3))
# Graph the age bins and the average of the target as a bar plot
sns.barplot(x=age_groups.index, y=age_groups*100)
# Plot labeling
plt.xticks(rotation = 30)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');
There is a clear trend: younger applicants are more likely to fail to repay. The failure-to-repay rate is above 10% for the three youngest age groups and about 5% for the oldest group.
# Using pandas
def discretize(X, variables=None, bins=10, strategy="uniform", encoding=None):
"""
Parameters
----------
bucket_labels: dict, default=None
"""
X = X.copy()
if strategy not in ["uniform", "quantile"]:
raise ValueError("strategy takes only values 'uniform' or 'quantile'")
if encoding not in [None, "onehot", "ordinal"]:
raise ValueError("encoding takes only values None, 'onehot' or 'ordinal'")
if variables is None:
variables = X.select_dtypes("number").columns.tolist()
if strategy == "uniform":
X_binned = X[variables].apply(pd.cut, bins=bins, duplicates='drop')
elif strategy == "quantile":
X_binned = X[variables].apply(pd.qcut, q=bins, duplicates='drop')
if encoding == "onehot":
X_binned = pd.get_dummies(X_binned, dummy_na=True)
elif encoding == "ordinal":
X_binned = X_binned.apply(lambda x: x.cat.codes)
X = pd.concat([X.drop(variables, axis=1), X_binned], axis=1)
return X
discretize(X)[['DAYS_BIRTH', 'DAYS_EMPLOYED']].head()
DAYS_BIRTH | DAYS_EMPLOYED | |
---|---|---|
0 | (-11037.0, -9263.0] | (-1791.2, 0.0] |
1 | (-18133.0, -16359.0] | (-1791.2, 0.0] |
2 | (-19907.0, -18133.0] | (-1791.2, 0.0] |
3 | (-19907.0, -18133.0] | (-3582.4, -1791.2] |
4 | (-21681.0, -19907.0] | (-3582.4, -1791.2] |
KBinsDiscretizer in the sklearn.preprocessing module can perform equal-frequency, equal-width, or k-means binning, and can additionally one-hot or ordinal encode the resulting discrete features.
from sklearn.preprocessing import KBinsDiscretizer
equal_width_discretiser = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
equal_frequency_discretiser = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
kmeans_cluster_discretiser = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
equal_frequency_discretiser.fit(X[['DAYS_BIRTH', 'DAYS_EMPLOYED']].fillna(0))
for i, col in enumerate(equal_frequency_discretiser.get_feature_names_out()):
    print(f"{col}'s bin_edges: ")
    print(equal_frequency_discretiser.bin_edges_[i])
DAYS_BIRTH's bin_edges: [-25197. -22162. -20463. -18866. -17208. -15740. -14411. -13129.7 -11679. -10275. -7489. ] DAYS_EMPLOYED's bin_edges: [-17912. -4880. -3239. -2367. -1699. -1221. -828. -463. -147. 0.]
Pipeline Implementation
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"
ordered_levels = {
"NAME_EDUCATION_TYPE": ["Lower secondary",
"Secondary / secondary special",
"Incomplete higher",
"Higher education"]
}
ordinal_encoder = OrdinalEncoder(
categories=[np.array(levels) for levels in ordered_levels.values()],
handle_unknown='use_encoded_value',
unknown_value=-1,
encoded_missing_value=-1)
# replace categories by the mean value of the target for each category.
mean_encoder = MeanEncoder(
missing_values='ignore',
ignore_format=True,
unseen='ignore')
# The nominal (unordered) categorical features
nominal_categories = [col for col in categorical_cols if col not in ordered_levels]
features_onehot = [col for col in nominal_categories if col not in ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']]
onehot_encoder = OneHotEncoder(
drop='if_binary',
min_frequency=0.02,
max_categories=20,
sparse_output=False,
handle_unknown='ignore')
# Encode categorical features
categorical_encoding = make_column_transformer(
(mean_encoder, ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']),
(ordinal_encoder, list(ordered_levels)),
(onehot_encoder, features_onehot),
remainder='passthrough',
verbose_feature_names_out=False,
verbose=True
)
X_encoded = categorical_encoding.fit_transform(X_imputed, y)
X_encoded = pd.DataFrame(
X_encoded,
columns=categorical_encoding.get_feature_names_out()
)
# X_encoded = X_encoded.astype('category')
X_encoded = X_encoded.apply(pd.to_numeric, errors='ignore')
[ColumnTransformer] ... (1 of 4) Processing meanencoder, total= 0.0s [ColumnTransformer] (2 of 4) Processing ordinalencoder, total= 0.1s [ColumnTransformer] . (3 of 4) Processing onehotencoder, total= 1.7s [ColumnTransformer] ..... (4 of 4) Processing remainder, total= 0.0s
X_encoded.dtypes.value_counts()
float64 146 bool 10 Name: count, dtype: int64
Outlier Detection
Real-world data often contains quite a few abnormal records, which can badly bias a model. There are many detection methods, such as univariate rules (3-sigma, box plot) or unsupervised methods like clustering, Isolation Forest, and LocalOutlierFactor.
Method | Python module |
---|---|
Box-plot (IQR) capping | feature_engine.outliers.Winsorizer |
3-sigma rule | feature_engine.outliers.Winsorizer |
Cluster-based detection | self-defined |
One Class SVM | sklearn.svm.OneClassSVM |
One Class SVM (SGD) | sklearn.linear_model.SGDOneClassSVM |
Elliptic Envelope | sklearn.covariance.EllipticEnvelope |
Isolation Forest | sklearn.ensemble.IsolationForest |
LOF | sklearn.neighbors.LocalOutlierFactor |
Box Plot Detection
Box-plot detection flags outliers based on the quartiles, which are robust and not disturbed by extreme values. Points below $Q_1 - 1.5 \times IQR$ or above $Q_3 + 1.5 \times IQR$ are usually considered outliers.
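A minimal pandas sketch of these fences for one column (the OutlierCapper transformer defined below uses the same quartiles, but with a wider fold of 3):
# Minimal sketch: 1.5 * IQR fences for a single column
q1, q3 = X['AMT_CREDIT'].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print(f"{(~X['AMT_CREDIT'].between(lower, upper)).sum()} potential outliers in AMT_CREDIT")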
X_outlier = X[['DAYS_EMPLOYED', 'AMT_CREDIT']]
fig = plt.figure(figsize=(8,3))
for i, col in enumerate(X_outlier.columns.tolist()):
ax = fig.add_subplot(1, 2, i+1)
sns.boxplot(data=X_outlier, y=col, ax=ax)
3-Sigma Rule
The 3-sigma rule assumes the data follow a normal distribution and treats points more than $3\sigma$ away from the mean as outliers, since $\mathbb P(|X-\mu|<3\sigma)=99.73\%$. If the data are not normal, some multiple of the standard deviation away from the mean can still be used as a rough rule.
Implemented with pandas and wrapped in a transformer:
class OutlierCapper(BaseEstimator, TransformerMixin):
"""
Caps maximum and/or minimum values of a variable at automatically
determined values.
Works only with numerical variables. A list of variables can be indicated.
Parameters
----------
method: str, 'gaussian' or 'iqr', default='iqr'
If method='gaussian':
- upper limit: mean + 3 * std
- lower limit: mean - 3 * std
If method='iqr':
- upper limit: 75th quantile + 3 * IQR
- lower limit: 25th quantile - 3 * IQR
where IQR is the inter-quartile range: 75th quantile - 25th quantile.
fold: int, default=3
You can select how far out to cap the maximum or minimum values.
"""
def __init__(self, method='iqr', fold=3, variables=None):
self.method = method
self.fold = fold
self.variables = variables
def fit(self, X, y=None):
"""
Learn the values that should be used to replace outliers.
Parameters
----------
X : pandas dataframe of shape = [n_samples, n_features]
The training input samples.
y : pandas Series, default=None
y is not needed in this transformer. You can pass y or None.
"""
# Get the names and number of features in the train set.
self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]
# find or check for numerical variables
numeric_vars = X.select_dtypes("number").columns.tolist()
if self.variables is None:
self.variables = numeric_vars
else:
self.variables = list(set(numeric_vars) & set(self.variables))
if self.method == "gaussian":
mean = X[self.variables].mean()
bias= [mean, mean]
scale = X[self.variables].std(ddof=0)
elif self.method == "iqr":
Q1 = X[self.variables].quantile(q=0.25)
Q3 = X[self.variables].quantile(q=0.75)
bias = [Q1, Q3]
scale = Q3 - Q1
# estimate the end values
if (scale == 0).any():
raise ValueError(
f"Input columns {scale[scale == 0].index.tolist()!r}"
f" have low variation for method {self.method!r}."
f" Try other capping methods or drop these columns."
)
else:
self.upper_limit = bias[1] + self.fold * scale
self.lower_limit = bias[0] - self.fold * scale
self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]
return self # always return self!
def transform(self, X, y=None):
"""
Cap the variable values.
Parameters
----------
X: pandas dataframe of shape = [n_samples, n_features]
The data to be transformed.
Returns
-------
X_new: pandas dataframe of shape = [n_samples, n_features]
The dataframe with the capped variables.
"""
X = X.copy()
# check if class was fitted
check_is_fitted(self)
        outliers = (X[self.variables].gt(self.upper_limit) |
                    X[self.variables].lt(self.lower_limit))
        n = outliers.sum().gt(0).sum()
        print(f"Your selected dataframe has {n} out of {outliers.shape[1]} columns that have outliers.")
# replace outliers
X[self.variables] = X[self.variables].clip(
axis=1,
upper=self.upper_limit,
lower=self.lower_limit
)
return X
def get_feature_names_out(self, input_features=None):
check_is_fitted(self)
if input_features is None:
return self.feature_names_in_
elif len(input_features) == self.n_features_in_:
# If the input was an array, we let the user enter the variable names.
return list(input_features)
else:
raise ValueError(
"The number of input_features does not match the number of "
"features seen in the dataframe used in fit."
)
outlier_capper = OutlierCapper()
X_capped = outlier_capper.fit_transform(X_outlier)
fig = plt.figure(figsize=(8,3))
for i, col in enumerate(X_capped.columns.tolist()):
ax = fig.add_subplot(1, 2, i+1)
sns.boxplot(data=X_capped, y=col, ax=ax)
Your selected dataframe has 2 out of 2 columns that have outliers.
outlier_capper = OutlierCapper(method="gaussian")
outlier_capper.fit_transform(X.select_dtypes('number')).shape
Your selected dataframe has 92 out of 107 columns that have outliers.
(307511, 107)
Outlier Detection Algorithms in sklearn
Anomaly detection algorithms currently supported by sklearn (a short usage sketch follows the list):
- One Class SVM: based on the SVM idea (with a Gaussian kernel), it learns a hypersphere in feature space; points outside the boundary are outliers.
- Elliptic Envelope: assumes the data follow a normal distribution and fits an elliptical envelope; points outside the boundary are outliers.
- Isolation Forest: an efficient anomaly detection algorithm similar to random forests, except that the split feature and split value are chosen at random instead of by information gain or the Gini index.
- LOF: a density-based algorithm; an outlier's local density is significantly lower than that of most of its neighbors, which makes it suitable for non-uniform data sets.
- Cluster-based detection: cluster the training samples, typically with KMeans; if a cluster contains very few samples and its centroid is far away from all the other clusters, its samples are very likely anomalous.
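A rough usage sketch of two of these estimators, fit on a small sample of the two numeric columns used earlier, purely for illustration (predict returns +1 for inliers and -1 for outliers):
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

sample = X[['DAYS_EMPLOYED', 'AMT_CREDIT']].dropna().sample(10000, random_state=SEED)
lof_labels = LocalOutlierFactor(n_neighbors=20).fit_predict(sample)
ee_labels = EllipticEnvelope(contamination=0.01, random_state=SEED).fit(sample).predict(sample)
print((lof_labels == -1).sum(), (ee_labels == -1).sum())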
Pipeline Implementation
The flagged anomalous samples should be handled according to their real-world meaning:
- depending on the number and impact of the outliers, consider deleting the record;
- apply a log-scale transform to dampen the outliers;
- smooth the outliers through binning;
- replace the outliers with the mean/median/mode, which is simple and effective;
- flag the outliers or add an anomaly-score column;
- tree models are fairly robust to outliers, so they can also simply be ignored.
Next, we compute an Isolation Forest score on the numeric variables and flag the anomalous samples.
from sklearn.ensemble import IsolationForest
class CustomIsolationForest(IsolationForest, TransformerMixin):
"""
Isolation Forest Algorithm.
Compute the anomaly score of each sample using the IsolationForest algorithm.
"""
def __init__(self, drop_outliers=False, **kwargs):
super().__init__(**kwargs)
self.drop_outliers = drop_outliers
    def transform(self, X, y=None):
        anomaly_scores = super().decision_function(X)
        pred = super().predict(X)
        n_outliers = pred[pred == -1].size
        if self.drop_outliers:
            print(f"Remove {n_outliers} outliers from the dataset")
            return X.loc[pred == 1, :]
        else:
            # Return the anomaly score of each sample in X.
            print(f"The number of outliers: {n_outliers} ({n_outliers/len(X):.1%})")
            return anomaly_scores.reshape(-1, 1)
    def get_feature_names_out(self, input_features=None):
        if self.drop_outliers:
            return self.feature_names_in_
        else:
            return ["anomaly_score"]
# fit the model for anomaly detection
iforest = CustomIsolationForest()
anomaly_score = pd.DataFrame(
iforest.fit_transform(X_encoded),
columns=["anomaly_score"]
)
anomaly_score.head()
The number of outliers: 14477 (4.7%)
anomaly_score | |
---|---|
0 | 0.061131 |
1 | 0.055532 |
2 | 0.086801 |
3 | 0.140955 |
4 | 0.111094 |
Standardization / Normalization
Standardization and normalization can improve the accuracy of some algorithms and speed up gradient-descent convergence. Quite a few models do not need them at all, mainly those based on probability distributions, such as CART decision trees and random forests.
- z-score standardization is the most common preprocessing step; essentially all linear models standardize before fitting. It assumes the feature is roughly normally distributed and rescales it to zero mean and unit standard deviation.
- min-max scaling (also called range scaling) maps feature values into [0, 1]. Its problem is that values in the test or prediction data smaller than min or larger than max change the range and force a recomputation, so unless you specifically need a bounded value range, z-score standardization is usually the better choice.
- L1/L2 norm normalization: if the goal is only to unify the scale, the data can be normalized with the L2 norm. A short usage sketch follows the table below.
sklearn.preprocessing | Description |
---|---|
StandardScaler() | z-score standardization |
Normalizer(norm='l2') | normalization with the l1, l2 or max norm |
MinMaxScaler() | min-max scaling |
MaxAbsScaler() | max-abs scaling, the recommended way to scale sparse data |
RobustScaler() | quantile-based scaling, recommended for data with outliers |
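A minimal sketch of the two most common scalers on a single column, purely for illustration:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

credit = X[['AMT_CREDIT']]
z_scored = StandardScaler().fit_transform(credit)    # (x - mean) / std
min_maxed = MinMaxScaler().fit_transform(credit)     # (x - min) / (max - min)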
The pandas building blocks for z-score standardization and quantile scaling already appeared in the outlier-capping function above; the other scaling methods are rarely needed.
Since the data still contains some outliers, we use RobustScaler to standardize it.
from sklearn.preprocessing import RobustScaler
pd.DataFrame(RobustScaler().fit_transform(X[['DAYS_EMPLOYED', 'AMT_CREDIT']])).describe()
0 | 1 | |
---|---|---|
count | 252137.000000 | 307511.000000 |
mean | -0.305718 | 0.158721 |
std | 0.971080 | 0.747221 |
min | -6.754153 | -0.869825 |
25% | -0.634136 | -0.452114 |
50% | 0.000000 | 0.000000 |
75% | 0.365864 | 0.547886 |
max | 0.684385 | 6.565430 |
# Check the skew of all numerical features
skewness = X.select_dtypes('number').skew().sort_values()
skewness[abs(skewness) > 0.75].head(20)
FLAG_MOBIL -554.536744 FLAG_CONT_MOBILE -23.081172 YEARS_BEGINEXPLUATATION_MEDI -15.573124 YEARS_BEGINEXPLUATATION_AVG -15.515264 YEARS_BEGINEXPLUATATION_MODE -14.755318 DAYS_EMPLOYED -1.968316 FLAG_EMP_PHONE -1.664886 YEARS_BUILD_MODE -1.002305 YEARS_BUILD_MEDI -0.962784 YEARS_BUILD_AVG -0.962485 FLAG_DOCUMENT_3 -0.925725 FLAG_OWN_REALTY -0.840293 EXT_SOURCE_2 -0.793576 FLOORSMIN_AVG 0.954197 FLOORSMIN_MEDI 0.960226 FLOORSMIN_MODE 0.963835 FLAG_PHONE 0.974083 CNT_FAM_MEMBERS 0.987543 FLOORSMAX_AVG 1.226454 AMT_CREDIT 1.234778 dtype: float64
These features are clearly skewed, so we try some transformations to bring the data closer to a normal distribution.
Q-Q Plot
Taking AMT_CREDIT as an example, we draw its distribution plot and Q-Q plot (using the helper function defined below).
A quantile-quantile (Q-Q) plot is a common statistical graphic for comparing two distributions: the quantiles of the standard normal distribution are plotted on the x-axis against the sample values on the y-axis. If the points lie close to a straight line, the data are approximately normal; the slope of that line estimates the standard deviation and its intercept the mean.
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot, norm
def norm_comparison_plot(series):
series = pd.Series(series)
mu, sigma = norm.fit(series)
kurt, skew = series.kurt(), series.skew()
print(f"Kurtosis: {kurt:.2f}", f"Skewness: {skew:.2f}", sep='\t')
fig = plt.figure(figsize=(10, 4))
# Now plot the distribution
ax1 = fig.add_subplot(121)
ax1.set_title('Distribution')
ax1.set_ylabel('Frequency')
    sns.distplot(series, fit=norm, ax=ax1)
    ax1.legend(['kde', rf'normal fit ($\mu=${mu:.2f}, $\sigma=${sigma:.2f})'], loc='best')
# Get also the QQ-plot
ax2 = fig.add_subplot(122)
probplot(series, plot=plt)
norm_comparison_plot(X['AMT_CREDIT'])
plt.show()
Kurtosis: 1.93 Skewness: 1.23
Non-linear Transformations
Non-linear transformations currently supported by the sklearn.preprocessing module:
Method | Description |
---|---|
QuantileTransformer | quantile transform; maps the data to a uniform distribution on [0, 1] or to a normal distribution |
PowerTransformer | power transform; maps data from any distribution to one as close to Gaussian as possible, to stabilize variance and minimize skew |
Beyond these, the log transform is the most commonly used. For features that contain negative values, you can first min-max scale them into [0, 1] and then apply the transform.
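A minimal sketch of the log transform and of QuantileTransformer on a non-negative, right-skewed feature (np.log1p avoids trouble at zero):
from sklearn.preprocessing import QuantileTransformer

income_log = np.log1p(X['AMT_INCOME_TOTAL'])                  # log transform
qt = QuantileTransformer(output_distribution='normal', random_state=SEED)
income_gauss = qt.fit_transform(X[['AMT_INCOME_TOTAL']])      # rank-based mapping to a normal shape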
Here we apply a Box-Cox transform to a skewed feature (AMT_INCOME_TOTAL in the code below).
The Box-Cox transform, proposed in 1964, helps a linear regression model satisfy the linearity, independence, homoscedasticity, and normality assumptions without losing information.
from sklearn.preprocessing import PowerTransformer
# Box Cox Transformation of skewed features (instead of log-transformation)
norm_trans = PowerTransformer("box-cox")
income_transformed = norm_trans.fit_transform(X['AMT_INCOME_TOTAL'].values.reshape(-1, 1))
norm_comparison_plot(income_transformed[:, 0])
plt.show()
Kurtosis: 0.49 Skewness: -0.01
After the Box-Cox transform, the feature is roughly normally distributed.
Baseline
At this point, data preprocessing is essentially complete.
X_prepared = pd.concat([X_encoded, anomaly_score], axis=1)
print(X_prepared.shape)
print(X_prepared.dtypes.value_counts())
(307511, 157) float64 147 bool 10 Name: count, dtype: int64
Normalize the feature names
X_prepared.columns = X_prepared.columns.str.replace('/','or').str.replace(' ','_').str.replace(',','_or')
def score_dataset(X, y, nfold=5):
# Create Dataset object for lightgbm
dtrain = lgb.Dataset(X, label=y, free_raw_data=False)
# Use a dictionary to set Parameters.
params = dict(
objective='binary',
is_unbalance=True,
metric='auc',
n_estimators=500,
verbose=0
)
# Training with 5-fold CV:
print('Starting training...')
eval_results = lgb.cv(
params,
dtrain,
nfold=nfold,
callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)],
return_cvbooster=True
)
boosters = eval_results['cvbooster'].boosters
# Initialize an empty dataframe to hold feature importances
feature_importances = pd.DataFrame(index=X.columns)
for i in range(nfold):
feature_importances[f'cv_{i}'] = boosters[i].feature_importance()
feature_importances['score'] = feature_importances.mean(axis=1)
# Sort features according to importance
feature_importances = feature_importances.sort_values('score', ascending=False)
return eval_results, feature_importances
eval_results, feature_importances = score_dataset(X_prepared, y)
Starting training... Training until validation scores don't improve for 50 rounds [50] cv_agg's valid auc: 0.754568 + 0.00339662 [100] cv_agg's valid auc: 0.756576 + 0.00312062 [150] cv_agg's valid auc: 0.756414 + 0.00303009 Early stopping, best iteration is: [124] cv_agg's valid auc: 0.756627 + 0.00292137
prepared_data = pd.concat([df[id_col], X_prepared, y], axis=1)
prepared_data.to_csv('../datasets/Home-Credit-Default-Risk/prepared_data.csv', index=False)
Feature Importance
feature_importances['score'].head(20)
EXT_SOURCE_3 230.4 EXT_SOURCE_1 227.6 AMT_CREDIT 213.2 EXT_SOURCE_2 205.4 AMT_ANNUITY 188.0 DAYS_BIRTH 172.8 AMT_GOODS_PRICE 156.2 DAYS_ID_PUBLISH 140.0 DAYS_EMPLOYED 137.0 DAYS_LAST_PHONE_CHANGE 121.6 DAYS_REGISTRATION 103.6 ORGANIZATION_TYPE 97.8 AMT_INCOME_TOTAL 90.0 REGION_POPULATION_RELATIVE 77.4 anomaly_score 74.2 OWN_CAR_AGE 63.2 OCCUPATION_TYPE 58.8 HOUR_APPR_PROCESS_START 48.0 CODE_GENDER_M 46.8 NAME_EDUCATION_TYPE 42.6 Name: score, dtype: float64