import numpy as np
import pandas as pd
import re
import sys

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import FeatureUnion, make_union, Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score

import featuretools as ft  # used below for EntitySet / deep feature synthesis
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import gc
plt.figure(figsize=(10, 6))

# iterate through the new features
for i, feature in enumerate(['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT',
                             'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']):
    # create a new subplot for each feature
    plt.subplot(2, 2, i + 1)
    sns.kdeplot(data=math_features, x=feature, hue='TARGET', common_norm=False)
discrete_to_trans = [f for f in features_to_trans if X[f].nunique() < 50]
continuous_to_trans = [f for f in features_to_trans if f not in discrete_to_trans]
def math_transform(X, y=None, variables=None, func=None, max_depth=2,
                   drop_original=True, verbose=False):
    """
    Apply math operators to create new features.

    Parameters
    ----------
    variables: list, default=None
        The list of input variables. If None, all numeric variables are used.
    func: List[str], default=['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric']
        List of Transform Feature functions to apply.
    drop_original: bool, default=True
        If True, the original variables to transform will be dropped from the dataframe.
    """
    if variables is None:
        variables = X.select_dtypes('number').columns.tolist()
    df = X[variables].copy()
    if func is None:
        func = ['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric']

    # Make an entityset and add the entity
    es = ft.EntitySet(id='single_table')
    es.add_dataframe(dataframe_name='df', dataframe=df, make_index=True, index='id')

    # Run deep feature synthesis with transformation primitives
    feature_matrix, features = ft.dfs(
        entityset=es,
        target_dataframe_name='df',
        trans_primitives=func,
        max_depth=max_depth,
        verbose=verbose
    )

    new_features = feature_matrix.drop(variables, axis=1)
    new_features.index = X.index

    if drop_original:
        return new_features
    else:
        return pd.concat([X, new_features], axis=1)
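Since the helper has a `(X, y=None, ...)` signature, it can be wrapped in a `FunctionTransformer` and composed with the rest of the pipeline. The sketch below is illustrative only: the chosen columns, primitives, and `max_depth=1` are assumptions, not the exact call used to build `math_features`.

# Hypothetical usage: pairwise products/ratios of a few numeric columns.
# The column names are illustrative; substitute your own numeric features.
math_trans = FunctionTransformer(
    math_transform,
    kw_args=dict(variables=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'],
                 func=['multiply_numeric', 'divide_numeric'],
                 max_depth=1)
)
new_feats = math_trans.fit_transform(X)   # assumes X is the application dataframe
print(new_feats.shape)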
# Group AMT_INCOME_TOTAL by OCCUPATION_TYPE and calculate mean, max, min of incomes
X.groupby('OCCUPATION_TYPE')['AMT_INCOME_TOTAL'].agg(['mean', 'max', 'min']).head()
from itertools import product, permutations, combinations

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector
class AggFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer to aggregate features in a dataframe.

    This can be used to create features for each instance of the grouping variable.

    Parameters
    ----------
    variables: list, default=None
        The list of input variables. If None, all variables in the dataframe are used.
    groupby: list, default=None
        The variables to group by. If None, all variables in the dataframe are used.
    func: function, string, list
        List of Aggregation Feature types to apply. Same functionality as
        parameter `func` in `pandas.agg()`.
        Built-in funcs: ['mode', 'kurt', 'frequency', 'num_unique']
        Default:
            - Numeric: ['median', 'max', 'min', 'skew', 'std']
            - Category: ['mode', 'num_unique', 'frequency']
    n_bins: int, default=20
        The number of bins to produce when discretizing continuous grouping variables.
    drop_original: bool, default=True
        If True, the original variables to transform will be dropped from the dataframe.
    """

    def __init__(self, variables=None, groupby=None, func=None, n_bins=20, drop_original=True):
        self.variables = variables
        self.groupby = groupby
        self.func = func
        self.n_bins = n_bins
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
        y : pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.
        """
        # check input dataframe
        # X, y = check_X_y(X, y)

        # Get the names and number of features in the train set.
        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]

        build_in_funcs = {'mode': self.mode, 'kurt': self.kurt,
                          'frequency': self.frequency, 'num_unique': pd.Series.nunique}

        assert self.func is not None, "Your selected funcs is None."
        self.func = [build_in_funcs.get(f, f) for f in self.func]

        if self.variables is None:
            self.variables = X.columns.tolist()
        if self.groupby is None:
            self.groupby = X.columns.tolist()
        return self
    def transform(self, X, y=None):
        """
        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features]
            A dataframe with the statistics aggregated over the selected variables.
            The columns are also renamed to keep track of features created.
        """
        X = X.copy()
        # check if class was fitted
        check_is_fitted(self)

        group_df = self.discretize(X[self.groupby], self.n_bins)

        # Group by the specified variable and calculate the statistics
        n = 0
        for group_var in self.groupby:
            # Skip the grouping variable
            other_vars = [var for var in self.variables if var != group_var]
            for f in self.func:
                # Need to create new column names
                colnames = [f"{f.__name__ if callable(f) else f}({var})_by({group_var})"
                            for var in other_vars]
                X[colnames] = X[other_vars].groupby(group_df[group_var]).transform(f)
                n += len(colnames)
        print(f'Created {n} new features.')

        if self.drop_original:
            X = X.drop(self.feature_names_in_, axis=1)
        return X

    def mode(self, series):
        return series.mode(dropna=False)[0]

    def kurt(self, series):
        return series.kurt()

    def frequency(self, series):
        freq = series.value_counts(normalize=True, dropna=False)
        return series.map(freq)

    def discretize(self, X, bins=20):
        X = X.copy()
        numeric = X.select_dtypes('number').columns
        continuous = [col for col in numeric if X[col].nunique() >= 50]
        X[continuous] = X[continuous].apply(pd.qcut, q=bins, duplicates="drop")
        X = X.astype('category')
        return X

    def get_feature_names_out(self, input_features=None):
        check_is_fitted(self)
        if input_features is None:
            feature_names_in = self.feature_names_in_
        elif len(input_features) == self.n_features_in_:
            # If the input was an array, we let the user enter the variable names.
            feature_names_in = list(input_features)
        else:
            raise ValueError(
                "The number of input_features does not match the number of "
                "features seen in the dataframe used in fit."
            )

        if self.drop_original:
            feature_names_out = []
        else:
            feature_names_out = feature_names_in

        func_names = [f.__name__ if callable(f) else f for f in self.func]
        for group_var in feature_names_in:
            # Skip the grouping variable
            other_vars = [var for var in self.variables if var != group_var]
            # Make new column names for the variable and stat
            colnames = [f"{f}({var})_by({group_var})"
                        for f, var in product(func_names, other_vars)]
            feature_names_out.extend(colnames)
        return feature_names_out
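A minimal usage sketch of the transformer. The column names and aggregation list are illustrative assumptions for the Home Credit application table, not the exact configuration used to build `agg_features` later.

# Hypothetical usage: aggregate income and credit amounts within occupation groups.
agg = AggFeatures(
    variables=['AMT_INCOME_TOTAL', 'AMT_CREDIT'],   # illustrative columns
    groupby=['OCCUPATION_TYPE'],
    func=['median', 'max', 'frequency'],
    drop_original=False,
)
agg_demo = agg.fit_transform(X)   # assumes X is the application dataframe
print([c for c in agg_demo.columns if '_by(' in c][:3])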
def feature_interaction(X, y=None, left=None, right=None, drop_original=True):
    """
    Parameters
    ----------
    X: pandas dataframe.
    left, right: list, default=None
        The lists of variables to interact.
    drop_original: bool, default=True
        If True, the original variables to transform will be dropped from the dataframe.
    """
    left = X.columns if left is None else left
    right = X.columns if right is None else right

    # Make a new dataframe to hold interaction features
    X_new = pd.DataFrame(index=X.index)
    for rvar in right:
        other_vars = [lvar for lvar in left if lvar != rvar]
        rseries = X[rvar].astype(str)
        colnames = [f"{lvar}&{rvar}" for lvar in other_vars]
        X_new[colnames] = X[other_vars].transform(lambda s: s.astype(str) + "&" + rseries)

    if not drop_original:
        X_new = pd.concat([X, X_new], axis=1)
    return X_new
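For illustration, pairing a few categorical columns of the application table; the column names below are assumptions, not the call actually used in this notebook.

# Hypothetical usage: concatenate pairs of categorical codes into interaction
# features, e.g. "Cash loans&M" from NAME_CONTRACT_TYPE and CODE_GENDER.
cat_cols = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'NAME_EDUCATION_TYPE']  # illustrative
interactions = feature_interaction(X[cat_cols], left=cat_cols, right=cat_cols)
print(interactions.columns.tolist()[:3])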
# Empty entity set with id 'clients'
es = ft.EntitySet(id='clients')
# Entities with a unique index
es = es.add_dataframe(dataframe_name='app', dataframe=app, index='SK_ID_CURR')
es = es.add_dataframe(dataframe_name='bureau', dataframe=bureau, index='SK_ID_BUREAU')
es = es.add_dataframe(dataframe_name='previous', dataframe=previous, index='SK_ID_PREV')
# Entities that do not have a unique index
es = es.add_dataframe(dataframe_name='bureau_balance', dataframe=bureau_balance,
                      make_index=True, index='bureau_balance_index')
es = es.add_dataframe(dataframe_name='cash', dataframe=cash,
                      make_index=True, index='cash_index')
es = es.add_dataframe(dataframe_name='installments', dataframe=installments,
                      make_index=True, index='installments_index')
es = es.add_dataframe(dataframe_name='credit', dataframe=credit,
                      make_index=True, index='credit_index')
# Add relationships between parent and child tables
es = es.add_relationship('app', 'SK_ID_CURR', 'bureau', 'SK_ID_CURR')
es = es.add_relationship('bureau', 'SK_ID_BUREAU', 'bureau_balance', 'SK_ID_BUREAU')
es = es.add_relationship('app', 'SK_ID_CURR', 'previous', 'SK_ID_CURR')
es = es.add_relationship('previous', 'SK_ID_PREV', 'cash', 'SK_ID_PREV')
es = es.add_relationship('previous', 'SK_ID_PREV', 'installments', 'SK_ID_PREV')
es = es.add_relationship('previous', 'SK_ID_PREV', 'credit', 'SK_ID_PREV')
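With the relationships in place, deep feature synthesis can roll the child tables up to the `app` level. The primitives and `max_depth` below are illustrative defaults, not necessarily the settings used to produce the feature matrix referenced later.

# Sketch: run DFS targeting the application table. Heavier primitive lists or a
# larger max_depth quickly explode the number of generated features.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='app',
    agg_primitives=['mean', 'max', 'min', 'sum', 'count'],
    trans_primitives=[],
    max_depth=2,
    verbose=True,
)
print(feature_matrix.shape)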
del X, feature_matrix, math_features, agg_features, cluster_similarities
gc.collect()
0
def drop_missing_data(X, threshold=0.8):
    X = X.copy()
    # Remove variables missing in more than (1 - threshold) of rows (default 20%)
    thresh = int(X.shape[0] * threshold)
    X_new = X.dropna(axis=1, thresh=thresh)
    print(f"Removed {X.shape[1] - X_new.shape[1]} variables with missing more than {1 - threshold:.1%}")
    return X_new
# Simple imputer
def impute_simply(X, threshold=0.8):
    """
    Univariate imputer for completing missing values with simple strategies.
    """
    print("Simple imputer:")
    X = X.copy()
    # Variables with a missing fraction in (0, 1 - threshold]
    variables = X.columns[X.isna().mean().between(0, 1 - threshold, "right")].tolist()
    features_num = X[variables].select_dtypes('number').columns.to_list()
    features_cat = X[variables].select_dtypes(exclude='number').columns.to_list()

    # Replace missing values with the median (numeric) or mode (categorical)
    medians = X[features_num].median().to_dict()
    modes = X[features_cat].apply(lambda x: x.mode()[0]).to_dict()
    impute_dict = {**medians, **modes}

    X[variables] = X[variables].fillna(impute_dict)
    print(f"Transformed {len(variables)} variables with missing (threshold={threshold:.1%}).")
    print(f"And then, there are {X.isna().sum().gt(0).sum()} variables with missing.")
    return X
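The two cleaning helpers are applied back to back to the feature matrix; the exact call is not shown in this excerpt, but it would look roughly like the sketch below (the dataframe name `X` is assumed).

# Sketch: drop sparse columns, then fill the remaining gaps.
X = drop_missing_data(X, threshold=0.8)
X = impute_simply(X, threshold=0.8)
print(f"Dataset shape: {X.shape}")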
Removed 368 variables with missing more than 20.0%
Simple imputer:
Transformed 551 variables with missing (threshold=80.0%).
And then, there are 0 variables with missing.
Dataset shape: (307511, 2167)
We continue to use a LightGBM model to evaluate the newly created features.
def score_dataset(X, y, nfold=5):
    # Create Dataset object for lightgbm
    dtrain = lgb.Dataset(X, label=y, free_raw_data=False)

    # Use a dictionary to set parameters
    params = dict(
        objective='binary',
        is_unbalance=True,
        metric='auc',
        n_estimators=500,
        verbose=0
    )

    # Training with 5-fold CV:
    print('Starting training...')
    eval_results = lgb.cv(
        params,
        dtrain,
        nfold=nfold,
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)],
        return_cvbooster=True
    )
    boosters = eval_results['cvbooster'].boosters

    # Initialize an empty dataframe to hold feature importances
    feature_importances = pd.DataFrame(index=X.columns)
    for i in range(nfold):
        # Record the feature importances
        feature_importances[f'cv_{i}'] = boosters[i].feature_importance()
    feature_importances['score'] = feature_importances.mean(axis=1)

    # Sort features according to importance
    feature_importances = feature_importances.sort_values('score', ascending=False)
    return eval_results, feature_importances
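A sketch of calling the scorer on the engineered dataset, assuming `X` is the feature matrix and `y` the `TARGET` column. Note that the name of the metric key in the CV results depends on the LightGBM version.

# Sketch: evaluate the engineered features with 5-fold CV.
eval_results, feature_importances = score_dataset(X, y)

# Metric key is version-dependent ('auc-mean' in older LightGBM, 'valid auc-mean' in 4.x).
auc_key = [k for k in eval_results if k.endswith('auc-mean')][0]
print(f"Best CV AUC: {max(eval_results[auc_key]):.4f}")
print(feature_importances.head(10))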