import re
import gc
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import FeatureUnion, make_union, Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
# Data cleaning
def clean(df):
    # Remove duplicates and keep the last occurrence of each id
    if df[id_col].nunique() < df.shape[0]:
        df = df.drop_duplicates(subset=[id_col], keep='last')
    # Convert columns to numeric dtypes where possible
    df = df.apply(pd.to_numeric, errors='ignore')
    # Map binary flags to 0/1
    for col in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']:
        df[col] = df[col].replace({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0})
    # Replace the anomalous DAYS_EMPLOYED values with NaN
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].where(df['DAYS_EMPLOYED'].abs() < 365243, np.nan)
    df = df.replace('XNA', np.nan)
    X = df.drop([id_col, target], axis=1)
    y = df[target]
    return X, y
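The globals id_col, target, X and y are used throughout the rest of this section but are not defined here. A minimal sketch of how they might be produced, assuming the Home Credit application data (the file path and column names are assumptions):

# Assumed setup: Home Credit application data with SK_ID_CURR as id and TARGET as label
id_col, target = "SK_ID_CURR", "TARGET"
application = pd.read_csv("application_train.csv")  # hypothetical path
X, y = clean(application)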
# Function to summarise missing values by column
def display_missing(df, threshold=None, verbose=True):
    missing_df = pd.DataFrame({
        "missing_number": df.isna().sum(),  # Total missing values
        "missing_rate": df.isna().mean()    # Proportion of missing values
    }, index=df.columns)
    missing_df = missing_df.query("missing_rate > 0").sort_values("missing_rate", ascending=False)
    thr = 0.25 if threshold is None else threshold
    high_missing = missing_df.query(f"missing_rate > {thr}")
    # Print some summary information
    if verbose:
        print(f"Your selected dataframe has {missing_df.shape[0]} out of {df.shape[1]} columns that have missing values.")
        print(f"There are {high_missing.shape[0]} columns with more than {thr:.1%} missing values.")
        print("Columns with high missing rate:", high_missing.index.tolist())
    # Return all columns with missing values, or only the highly missing ones
    if threshold is None:
        return missing_df
    return high_missing
def drop_missing_data(X, threshold=0.8):
    X = X.copy()
    # Drop variables whose missing rate exceeds 1 - threshold
    # (with the default threshold=0.8 this removes columns with more than 20% missing)
    thresh = int(X.shape[0] * threshold)
    X_new = X.dropna(axis=1, thresh=thresh)
    print(f"Removed {X.shape[1] - X_new.shape[1]} variables with missing more than {1 - threshold:.1%}")
    return X_new
drop_missing_data(X, threshold=0.2).shape
Removed 0 variables with missing more than 80.0%
(307511, 120)
# Adds a binary variable to flag missing observations.
from sklearn.feature_selection import chi2

def flag_missing(X, alpha=0.05):
    """
    Adds a binary variable to flag missing observations (one indicator per variable).
    The added variables (missing indicators) are named with the original variable
    name plus '_missing'.

    Parameters
    ----------
    alpha: float, default=0.05
        Variables whose chi-squared p-value exceeds alpha get a missing indicator.
    """
    X = X.copy()
    # Compute chi-squared stats between each missing indicator and y
    chi2_stats, p_values = chi2(X.isna(), y)
    # Find the variables for which an indicator should be added
    selected = X.columns[p_values > alpha]
    indicator_names = [col + "_missing" for col in selected]
    X[indicator_names] = X[selected].isna().astype(int).to_numpy()
    print(f"Added {len(selected)} missing indicators")
    return X
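A small usage sketch (illustrative only); the chi-squared test relies on the global y defined earlier:

# Add missing indicators on top of the cleaned features
X_flagged = flag_missing(X, alpha=0.05)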
# Conditional statistic completer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import r_regression, f_regression
def fillna_by_groups(X, threshold=(0.0, 0.8), groupby=None, k=2,
                     min_categories=2, bins=10, verbose=False):
    """
    Replaces missing values by group statistics.

    Parameters
    ----------
    threshold: tuple of floats, default=(0.0, 0.8)
        Only columns whose proportion of non-NA values falls inside this
        interval are imputed.
    k: int, default=2
        Number of top features to group by.
    min_categories: int, default=2
        Lower limit on the number of categories a feature must have to be
        used for grouping.
    bins: int, default=10
        Number of quantile bins used to discretise numeric grouping features.
    """
    print("Conditional Mean Completer:")
    lower, upper = threshold
    if not (0 <= lower < upper <= 1):
        raise ValueError("threshold must satisfy 0 <= lower < upper <= 1.")
    X = pd.DataFrame(X.copy())
    X_bin = X.copy()
    na_size = X.isna().sum().sum()

    features_num = X.select_dtypes(include='number').columns.tolist()
    features_cat = X.select_dtypes(exclude='number').columns.tolist()

    # Discretise numeric features and encode everything as non-negative codes,
    # because chi-squared requires non-negative inputs
    X_bin[features_num] = X_bin[features_num].apply(pd.qcut, q=bins, duplicates="drop")
    X_bin[features_cat] = X_bin[features_cat].astype('category')
    X_bin = X_bin.transform(lambda x: x.cat.codes)
    X_bin = X_bin.transform(lambda x: x - x.min())

    if groupby is None:
        features_groupby = X_bin.columns.tolist()
        features_groupby = [colname for colname in features_groupby
                            if X[colname].nunique() >= min_categories]
    else:
        features_groupby = list(groupby)

    # Impute only the columns whose non-NA rate falls inside the threshold interval
    variables = X.columns[X.notna().mean().between(lower, upper)].tolist()
    for colname in variables:
        other_features = list(set(features_groupby) - {colname})
        # Score the candidate grouping features against the column to impute
        if colname in features_num:
            score_func = f_regression
        elif colname in features_cat:
            score_func = chi2
        Xy = pd.concat([X_bin[other_features], X[colname]], axis=1).dropna(axis=0, how='any')
        scores, _ = score_func(Xy[other_features], Xy[colname])
        scores = pd.Series(scores, index=other_features).sort_values(ascending=False)
        vars_top_k = scores[:k].index.tolist()
        groups = [X_bin[col] for col in vars_top_k]
        if colname in features_num:
            # Replace missing values by the group median
            X[colname] = X.groupby(groups)[colname].transform(lambda x: x.fillna(x.median()))
            if verbose:
                print(f"Filling the missing values in {colname} with the medians of {vars_top_k} groups.")
        elif colname in features_cat:
            # Replace missing values by the most frequent category within each group
            X[colname] = X[colname].groupby(groups).transform(lambda x: x.fillna(x.mode(dropna=False)[0]))
            if verbose:
                print(f"Filling the missing values in {colname} with the modes of {vars_top_k} groups.")
    fillna_size = na_size - X.isna().sum().sum()
    print(f"Filled {fillna_size} missing values ({fillna_size/na_size:.1%}).")
    print(f"Transformed {len(variables)} variables with missing (threshold = [{lower:.1%}, {upper:.1%}]).")
    print(f"And then, there are {X.isna().sum().gt(0).sum()} variables with missing.")
    return X
fillna_by_groups(X).isna().sum().sum()
Conditional Mean Completer:
Filled 759918 missing values (8.2%).
Transformed 50 variables with missing (threshold = [0.0%, 80.0%]).
And then, there are 67 variables with missing.
8503299
Removed 0 variables with missing more than 80.0%
Added 6 missing indicators
Conditional Mean Completer:
Filled 726563 missing values (8.2%).
Transformed 49 variables with missing (threshold = [20.0%, 80.0%]).
And then, there are 61 variables with missing.
Simple imputer:
Transformed 61 variables with missing (threshold=20.0%).
And then, there are 0 variables with missing.
0
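The output above comes from chaining the cleaning steps; the "Simple imputer" step itself is not shown in this section. A hedged sketch of what the chain might look like (the final median/mode fill is an assumption standing in for the missing step):

# Hedged sketch of the chain that produced the output above
X_tmp = drop_missing_data(X, threshold=0.2)
X_tmp = flag_missing(X_tmp, alpha=0.05)
X_tmp = fillna_by_groups(X_tmp, threshold=(0.2, 0.8))
# Fill whatever is still missing with a simple global statistic (assumed step)
num_cols = X_tmp.select_dtypes(include='number').columns
cat_cols = X_tmp.select_dtypes(exclude='number').columns
X_tmp[num_cols] = X_tmp[num_cols].fillna(X_tmp[num_cols].median())
X_tmp[cat_cols] = X_tmp[cat_cols].apply(lambda s: s.fillna(s.mode(dropna=False)[0]))
X_tmp.isna().sum().sum()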
Implementing the same steps with sklearn
First, define a few custom transformers.
class DropMissingData(BaseEstimator, TransformerMixin):
    """
    Remove features with too many missing values from the data.

    Parameters
    ----------
    threshold: float, default=0.8
        Required percentage of non-NA values in a column to keep it.
    """

    def __init__(self, threshold=0.8):
        if 0 < threshold <= 1:
            self.threshold = threshold
        else:
            raise ValueError("threshold must be a value between 0 < x <= 1.")

    def fit(self, X, y=None):
        """
        Find the variables that should be dropped.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data set.
        y: pandas Series, default=None
            y is not needed. You can pass None or y.
        """
        # check input dataframe
        # X, y = check_X_y(X, y)

        # Get the names and number of features in the train set (the dataframe used during fit).
        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]
        # Find the features to drop
        self.variables = X.columns[X.isna().mean().gt(1 - self.threshold)].tolist()
        return self

    def transform(self, X, y=None):
        """
        Remove variables whose missing rate exceeds 1 - threshold.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------
        X_new: pandas dataframe
            The dataframe without the dropped variables.
        """
        print(f"Removed {len(self.variables)} variables with missing more than {1 - self.threshold:.1%}")
        return X.drop(self.variables, axis=1)

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names for transformation, i.e. the variable names
        of the transformed dataframe.

        Parameters
        ----------
        input_features : array or list, default=None
            This parameter exists only for compatibility with the scikit-learn pipeline.

            - If `None`, then `feature_names_in_` is used as the input feature names.
            - If an array or list, then `input_features` must match `feature_names_in_`.

        Returns
        -------
        feature_names_out: list
            Transformed feature names.
        """
        check_is_fitted(self)

        if input_features is None:
            feature_names_in = self.feature_names_in_
        elif len(input_features) == self.n_features_in_:
            # If the input was an array, we let the user enter the variable names.
            feature_names_in = list(input_features)
        else:
            raise ValueError(
                "The number of input_features does not match the number of "
                "features seen in the dataframe used in fit."
            )
        # Remove the dropped features.
        feature_names_out = [var for var in feature_names_in if var not in self.variables]
        return feature_names_out
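A small usage sketch (illustrative only), showing the transformer used standalone; it also works as a pipeline step:

# Fit on the training features and drop the sparse columns
dropper = DropMissingData(threshold=0.8)
X_reduced = dropper.fit_transform(X)
print(len(dropper.get_feature_names_out()))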
# Using pandas to encode categorical features
from pandas.api.types import CategoricalDtype
def onehot_encode(X, variables=None, dummy_na=True):
    """
    Replace the categorical variables by binary dummy variables.

    Parameters
    ----------
    X: pd.DataFrame of shape = [n_samples, n_features]
        The data to encode. Can be the entire dataframe, not just the selected variables.
    variables: list, default=None
        The list of categorical variables that will be encoded. If None, the
        encoder will find and encode all variables of type object or categorical.
    dummy_na: boolean, default=True
        Add a column to indicate NaNs.

    Returns
    -------
    X_new: pd.DataFrame
        The encoded dataframe. Its shape differs from the original, as the
        dummy variables take the place of the original categorical ones.
    """
    # pd.get_dummies automatically converts categorical columns into dummy variables
    if variables is None:
        variables = X.select_dtypes(exclude='number').columns.tolist()
        X = pd.get_dummies(X, dummy_na=dummy_na)
    else:
        X_dummy = pd.get_dummies(X[variables].astype(str), dummy_na=dummy_na)
        X = pd.concat([X, X_dummy], axis=1)
        # drop the original non-encoded variables
        X = X.drop(variables, axis=1)
    print(f'{len(variables):d} columns were one-hot encoded')
    print(f'Dataset shape: {X.shape}')
    return X
In most cases, encoding categorical features with sklearn's OneHotEncoder or OrdinalEncoder is enough; such simple preprocessing satisfies the needs of most data-mining algorithms. But when a categorical feature has many possible values (high cardinality), one-hot encoding tends to blow up the dimensionality. Mean encoding is an efficient alternative that, in practice, can considerably improve model performance.
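To make the idea concrete, a minimal sketch of mean encoding with plain pandas (no smoothing or cross-validation folds, so it is prone to target leakage; the helper name is illustrative only):

# Mean encoding sketch: replace each category by the mean of the target
# observed for that category on the training data.
def mean_encode(train_col, y_train):
    mapping = y_train.groupby(train_col).mean()
    return train_col.map(mapping), mapping

encoded, mapping = mean_encode(X['OCCUPATION_TYPE'], y)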
# Bin the age data
age_binned = pd.cut(X['DAYS_BIRTH'] / -365, bins=np.linspace(20, 70, num=11))
age_groups = y.groupby(age_binned).mean()
plt.figure(figsize=(8, 3))
# Graph the age bins and the average of the target as a bar plot
sns.barplot(x=age_groups.index, y=age_groups * 100)
# Plot labeling
plt.xticks(rotation=30)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');
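The equal_width_discretiser used below is not defined in this section; a plausible definition, assuming sklearn's KBinsDiscretizer with the uniform (equal-width) strategy and an illustrative bin count:

# Assumed definition of equal_width_discretiser (not shown in this section)
from sklearn.preprocessing import KBinsDiscretizer
equal_width_discretiser = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')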
equal_width_discretiser.fit(X[['DAYS_BIRTH', 'DAYS_EMPLOYED']].fillna(0))
for i, col in enumerate(equal_width_discretiser.get_feature_names_out()):
    print(f"{col}'s bin_edges: ")
    print(equal_width_discretiser.bin_edges_[i])
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"
ordered_levels = {
    "NAME_EDUCATION_TYPE": ["Lower secondary",
                            "Secondary / secondary special",
                            "Incomplete higher",
                            "Higher education"]
}
ordinal_encoder = OrdinalEncoder(
    categories=[np.array(levels) for levels in ordered_levels.values()],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    encoded_missing_value=-1)
# Replace categories by the mean value of the target for each category.
# (MeanEncoder presumably comes from feature_engine.encoding; its import is not shown here.)
mean_encoder = MeanEncoder(
    missing_values='ignore',
    ignore_format=True,
    unseen='ignore')
# The nominative (unordered) categorical features
nominal_categories = [col for col in categorical_cols if col not in ordered_levels]
# The high-cardinality columns are mean-encoded instead of one-hot encoded
features_onehot = [col for col in nominal_categories
                   if col not in ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']]
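A hedged sketch of how the three encoders defined above might be combined in a single ColumnTransformer; the composition and the remainder='passthrough' choice are assumptions, not shown in this section:

# Combine the ordinal, mean and one-hot encoders over their feature groups
encoder = ColumnTransformer(
    transformers=[
        ("ordinal", ordinal_encoder, list(ordered_levels)),
        ("mean", mean_encoder, ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']),
        ("onehot", OneHotEncoder(handle_unknown='ignore'), features_onehot),
    ],
    remainder='passthrough')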
fig = plt.figure(figsize=(8, 3))
for i, col in enumerate(X_outlier.columns.tolist()):
    ax = fig.add_subplot(1, 2, i + 1)
    sns.boxplot(data=X_outlier, y=col, ax=ax)
class OutlierCapper(BaseEstimator, TransformerMixin):
    """
    Caps maximum and/or minimum values of a variable at automatically
    determined values. Works only with numerical variables. A list of
    variables can be indicated.

    Parameters
    ----------
    method: str, 'gaussian' or 'iqr', default='iqr'
        If method='gaussian':
            - upper limit: mean + 3 * std
            - lower limit: mean - 3 * std
        If method='iqr':
            - upper limit: 75th quantile + 3 * IQR
            - lower limit: 25th quantile - 3 * IQR
        where IQR is the inter-quartile range: 75th quantile - 25th quantile.
    fold: int, default=3
        How far out to cap the maximum or minimum values.
    variables: list, default=None
        The variables to cap. If None, all numerical variables are capped.
    """

    def __init__(self, method="iqr", fold=3, variables=None):
        self.method = method
        self.fold = fold
        self.variables = variables
    def fit(self, X, y=None):
        """
        Learn the values that should be used to replace outliers.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
        y : pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.
        """
        # Get the names and number of features in the train set.
        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]
        # Find or check for numerical variables
        numeric_vars = X.select_dtypes("number").columns.tolist()
        if self.variables is None:
            self.variables = numeric_vars
        else:
            self.variables = list(set(numeric_vars) & set(self.variables))
        # Estimate the capping values
        if self.method == "gaussian":
            mean = X[self.variables].mean()
            bias = [mean, mean]
            scale = X[self.variables].std(ddof=0)
        elif self.method == "iqr":
            Q1 = X[self.variables].quantile(q=0.25)
            Q3 = X[self.variables].quantile(q=0.75)
            bias = [Q1, Q3]
            scale = Q3 - Q1

        if (scale == 0).any():
            raise ValueError(
                f"Input columns {scale[scale == 0].index.tolist()!r}"
                f" have low variation for method {self.method!r}."
                f" Try other capping methods or drop these columns."
            )
        self.upper_limit = bias[1] + self.fold * scale
        self.lower_limit = bias[0] - self.fold * scale
        return self
    def transform(self, X, y=None):
        """
        Cap the variable values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features]
            The dataframe with the capped variables.
        """
        # check if class was fitted
        check_is_fitted(self)
        X = X.copy()
        outliers = (X[self.variables].gt(self.upper_limit) |
                    X[self.variables].lt(self.lower_limit))
        n = outliers.sum().gt(0).sum()
        print(f"Your selected dataframe has {n} out of {outliers.shape[1]} columns that have outliers.")
        # replace outliers
        X[self.variables] = X[self.variables].clip(
            axis=1,
            upper=self.upper_limit,
            lower=self.lower_limit
        )
        return X
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation."""
        check_is_fitted(self)

        if input_features is None:
            return self.feature_names_in_
        elif len(input_features) == self.n_features_in_:
            # If the input was an array, we let the user enter the variable names.
            return list(input_features)
        else:
            raise ValueError(
                "The number of input_features does not match the number of "
                "features seen in the dataframe used in fit."
            )
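The X_capped dataframe plotted below is not produced in this section; a hedged sketch of how it might be obtained from the capper, assuming X_outlier is the two-column frame plotted above:

# Fit the capper and cap the outlying values
capper = OutlierCapper(method="iqr", fold=3)
X_capped = capper.fit_transform(X_outlier)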
fig = plt.figure(figsize=(8, 3))
for i, col in enumerate(X_capped.columns.tolist()):
    ax = fig.add_subplot(1, 2, i + 1)
    sns.boxplot(data=X_capped, y=col, ax=ax)
Your selected dataframe has 2 out of 2 columns that have outliers.
from sklearn.ensemble import IsolationForest

class CustomIsolationForest(IsolationForest, TransformerMixin):
    """
    Isolation Forest algorithm.

    Computes the anomaly score of each sample using the IsolationForest algorithm.
    """

    def __init__(self, drop_outliers=False, **kwargs):
        super().__init__(**kwargs)
        self.drop_outliers = drop_outliers

    def transform(self, X, y=None):
        anomaly_scores = super().decision_function(X)
        pred = super().predict(X)
        n_outliers = pred[pred == -1].size
        if self.drop_outliers:
            print(f"Remove {n_outliers} outliers from the dataset")
            return X.loc[pred == 1, :]
        else:
            # Return the anomaly score of each sample in X.
            print(f"The number of outliers: {n_outliers} ({n_outliers/X.shape[0]:.1%})")
            return anomaly_scores.reshape(-1, 1)

    def get_feature_names_out(self, input_features=None):
        if self.drop_outliers:
            return self.feature_names_in_
        else:
            return ["anomaly_score"]
# Fit the model for anomaly detection
iforest = CustomIsolationForest()
anomaly_score = pd.DataFrame(
    iforest.fit_transform(X_encoded),
    columns=["anomaly_score"]
)
anomaly_score.head()
from sklearn.preprocessing import PowerTransformer

# Box-Cox transformation of skewed features (instead of a log transformation).
# Note that Box-Cox requires strictly positive input values.
norm_trans = PowerTransformer(method="box-cox")
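A hedged usage sketch, applying the transformer to a strictly positive skewed column; the column name AMT_INCOME_TOTAL is an illustrative choice from the assumed Home Credit data:

# Box-Cox the income column, which is strictly positive and right-skewed
income_normed = norm_trans.fit_transform(X[["AMT_INCOME_TOTAL"]])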