from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml import Estimator, Transformer
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
import pyspark.sql.functions as fn
import pyspark.ml.feature as ft
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql import Observation
from pyspark.sql import Window
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from xgboost.spark import SparkXGBClassifier
import xgboost as xgb

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time
import warnings

# Setting configuration.
warnings.filterwarnings('ignore')
SEED = 42

# Use 0.11.4-spark3.3 version for Spark3.3 and 1.0.2 version for Spark3.4
spark = SparkSession.builder \
            .master("local[*]") \
            .appName("XGBoost with PySpark") \
            .config("spark.driver.memory", "10g") \
            .config("spark.driver.cores", "2") \
            .config("spark.executor.memory", "10g") \
            .config("spark.executor.cores", "2") \
            .enableHiveSupport() \
            .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

24/06/01 11:20:13 WARN Utils: Your hostname, MacBook-Air resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
24/06/01 11:20:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/01 11:20:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

df = spark.sql("select * from home_credit_default_risk.application_train")

Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary.

df.limit(5).toPandas()

print(f"dataset shape: ({df.count()}, {len(df.columns)})")

[Stage 1:====================================>                    (16 + 8) / 25]

dataset shape: (307511, 122)

# df.printSchema()

# Number of each type of column
dtypes = dict(df.dtypes)
pd.Series(dtypes).value_counts()

double    65
int       41
string    16
Name: count, dtype: int64

df.summary().toPandas()

Java HotSpot(TM) 64-Bit Server VM warning: CodeCache is full. Compiler has been disabled.
Java HotSpot(TM) 64-Bit Server VM warning: Try increasing the code cache size using -XX:ReservedCodeCacheSize=
[Stage 4:==================>                                       (8 + 8) / 25]

CodeCache: size=131072Kb used=51303Kb max_used=51313Kb free=79768Kb
 bounds [0x0000000106830000, 0x0000000109b20000, 0x000000010e830000]
 total_blobs=17400 nmethods=16279 adapters=1033
 compilation: disabled (not enough contiguous free space left)

# `TARGET` is the target variable we are trying to predict (0 or 1):
# 1 = Not Repaid 
# 0 = Repaid

# Check if the data is unbalanced
row = df.select(fn.mean('TARGET').alias('rate')).first()
print(f"percentage of default : {row['rate']:.2%}")
df.groupBy("TARGET").count().show()

percentage of default : 8.07%

+------+------+
|TARGET| count|
+------+------+
|     1| 24825|
|     0|282686|
+------+------+

# `SK_ID_CURR` is the unique id of the row.
df.dropDuplicates(subset=["SK_ID_CURR"]).count() == df.count()

True

dtypes = df.drop("SK_ID_CURR", "TARGET").dtypes

categorical_cols = [k for k, v in dtypes if v == 'string']
numerical_cols = [k for k, v in dtypes if v != 'string']

# df = df.withColumn('HOUR_APPR_PROCESS_START', df['HOUR_APPR_PROCESS_START'].astype(str))

df.select(df['DAYS_BIRTH'] / -365).summary().show()

[Stage 19:===================================>                    (16 + 8) / 25]

+-------+-------------------+
|summary|(DAYS_BIRTH / -365)|
+-------+-------------------+
|  count|             307511|
|   mean|  43.93697278587162|
| stddev| 11.956133237768654|
|    min| 20.517808219178082|
|    25%|  34.00547945205479|
|    50%|  43.14794520547945|
|    75%| 53.917808219178085|
|    max|  69.12054794520547|
+-------+-------------------+

for feature in ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']:
        print(f'{feature} info: ')
        df.select(df[feature] / -365).summary().show()

DAYS_BIRTH info:

+-------+-------------------+
|summary|(DAYS_BIRTH / -365)|
+-------+-------------------+
|  count|             307511|
|   mean|  43.93697278587162|
| stddev| 11.956133237768654|
|    min| 20.517808219178082|
|    25%|  34.00547945205479|
|    50%|  43.14794520547945|
|    75%| 53.917808219178085|
|    max|  69.12054794520547|
+-------+-------------------+

DAYS_EMPLOYED info:

+-------+----------------------+
|summary|(DAYS_EMPLOYED / -365)|
+-------+----------------------+
|  count|                307511|
|   mean|   -174.83574220287002|
| stddev|    387.05689457185537|
|    min|   -1000.6657534246575|
|    25%|    0.7917808219178082|
|    50%|    3.3232876712328765|
|    75%|     7.558904109589041|
|    max|     49.07397260273972|
+-------+----------------------+

DAYS_REGISTRATION info:

+-------+--------------------------+
|summary|(DAYS_REGISTRATION / -365)|
+-------+--------------------------+
|  count|                    307511|
|   mean|        13.660603637091562|
| stddev|         9.651743345104306|
|    min|                      -0.0|
|    25%|         5.504109589041096|
|    50%|        12.336986301369864|
|    75%|        20.487671232876714|
|    max|         67.59452054794521|
+-------+--------------------------+

DAYS_ID_PUBLISH info:

[Stage 31:===================================>                    (16 + 8) / 25]

+-------+------------------------+
|summary|(DAYS_ID_PUBLISH / -365)|
+-------+------------------------+
|  count|                  307511|
|   mean|        8.20329417328335|
| stddev|       4.135480600008283|
|    min|                    -0.0|
|    25%|      4.7095890410958905|
|    50%|       8.915068493150685|
|    75%|      11.775342465753425|
|    max|       19.71780821917808|
+-------+------------------------+

buckets = df.select((df['DAYS_EMPLOYED'] / -365).alias('DAYS_EMPLOYED'))

bucketizer = ft.QuantileDiscretizer(numBuckets=10, inputCol='DAYS_EMPLOYED', outputCol='buckets').fit(buckets)
buckets = bucketizer.transform(buckets)

buckets.groupBy('buckets').count().sort('buckets').show()
bucketizer.getSplits()

[Stage 36:===================================>                    (16 + 8) / 25]

+-------+-----+
|buckets|count|
+-------+-----+
|    1.0|61425|
|    2.0|30699|
|    3.0|30733|
|    4.0|30685|
|    5.0|30741|
|    6.0|30716|
|    7.0|30750|
|    8.0|30726|
|    9.0|31036|
+-------+-----+

[-inf,
 -1000.6657534246575,
 0.39452054794520547,
 1.252054794520548,
 2.2465753424657535,
 3.317808219178082,
 4.635616438356164,
 6.457534246575342,
 8.827397260273973,
 13.2986301369863,
 inf]

# Replace the anomalous values with nan
df_emp = df.select(fn.when(df['DAYS_EMPLOYED']>=365243, None).otherwise(df['DAYS_EMPLOYED']).alias('DAYS_EMPLOYED'))

df_emp.sample(0.1).toPandas().plot.hist(title = 'Days Employment Histogram')
plt.xlabel('Days Employment')

Text(0.5, 0, 'Days Employment')

for col in categorical_cols:
    unique_count = df.select(col).dropna().distinct().count()
    if unique_count == 2:
        df.groupBy(col).count().show()

+------------------+------+
|NAME_CONTRACT_TYPE| count|
+------------------+------+
|   Revolving loans| 29279|
|        Cash loans|278232|
+------------------+------+

+------------+------+
|FLAG_OWN_CAR| count|
+------------+------+
|           Y|104587|
|           N|202924|
+------------+------+

+---------------+------+
|FLAG_OWN_REALTY| count|
+---------------+------+
|              Y|213312|
|              N| 94199|
+---------------+------+

[Stage 145:================================================>      (22 + 3) / 25]

+-------------------+------+
|EMERGENCYSTATE_MODE| count|
+-------------------+------+
|               NULL|145755|
|                 No|159428|
|                Yes|  2328|
+-------------------+------+

cols_to_transform = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']
df.replace(
    ['Y', 'N', 'Yes', 'No'], ['1', '0', '1', '0'], 
    subset=cols_to_transform
).select(cols_to_transform).show(5)

+------------+---------------+-------------------+
|FLAG_OWN_CAR|FLAG_OWN_REALTY|EMERGENCYSTATE_MODE|
+------------+---------------+-------------------+
|           1|              0|                  0|
|           0|              1|                  0|
|           1|              1|               NULL|
|           0|              1|               NULL|
|           0|              1|                  0|
+------------+---------------+-------------------+
only showing top 5 rows

dtypes = df.drop("SK_ID_CURR", "TARGET").dtypes
categorical_cols = [k for k, v in dtypes if v == 'string']
numerical_cols = [k for k, v in dtypes if v != 'string']

# Data cleaning
def clean(df):
    # remove duplicates.
    df = df.dropDuplicates(subset=["SK_ID_CURR"])
    
    # transform
    cols_to_transform = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']
    df = df.replace(
        ['Y', 'N', 'Yes', 'No'], ['1', '0', '1', '0'], 
        subset=cols_to_transform
    )
    df = df.withColumns({c: df[c].cast('int') for c in cols_to_transform})
    
    # Replace the anomalous values with nan
    df = df.withColumn('DAYS_EMPLOYED', 
        fn.when(df['DAYS_EMPLOYED']>=365243, None).otherwise(df['DAYS_EMPLOYED'])
    )
    
    df = df.replace('XNA', None)
    df = df.withColumnRenamed("TARGET", "label")
    return df

df = clean(df)

df.select([fn.countDistinct(col).alias(col) for col in categorical_cols]).show(1, vertical=True)

[Stage 149:===================================================>  (96 + 4) / 100]

-RECORD 0-------------------------
 NAME_CONTRACT_TYPE         | 2   
 CODE_GENDER                | 2   
 FLAG_OWN_CAR               | 2   
 FLAG_OWN_REALTY            | 2   
 NAME_TYPE_SUITE            | 7   
 NAME_INCOME_TYPE           | 8   
 NAME_EDUCATION_TYPE        | 5   
 NAME_FAMILY_STATUS         | 6   
 NAME_HOUSING_TYPE          | 6   
 OCCUPATION_TYPE            | 18  
 WEEKDAY_APPR_PROCESS_START | 7   
 ORGANIZATION_TYPE          | 57  
 FONDKAPREMONT_MODE         | 4   
 HOUSETYPE_MODE             | 3   
 WALLSMATERIAL_MODE         | 7   
 EMERGENCYSTATE_MODE        | 2

# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"

ordered_levels = {
    "NAME_EDUCATION_TYPE": ["Lower secondary", 
                            "Secondary / secondary special", 
                            "Incomplete higher", 
                            "Higher education"]
}

def ordinal_encode(df, levels):
    for var, to_replace in levels.items():
        mapping = {v: str(i) for i,v in enumerate(to_replace)}
        df = df.replace(mapping, subset=[var])
        df = df.withColumn(var, df[var].cast('int'))
    print(f'{len(levels):d} columns were ordinal encoded')
    return df

ordinal_encode(df, ordered_levels).groupBy(*ordered_levels.keys()).count().show()

1 columns were ordinal encoded

[Stage 155:==============================================>       (87 + 8) / 100]

+-------------------+------+
|NAME_EDUCATION_TYPE| count|
+-------------------+------+
|               NULL|   164|
|                  1|218391|
|                  3| 74863|
|                  2| 10277|
|                  0|  3816|
+-------------------+------+

class MeanEncoder(Estimator, Transformer):
    def __init__(self, smoothing=0.0, inputCols=None, labelCol="label"):
        """
        The MeanEncoder() replaces categories by the mean value of the target for each
        category.
        
        math:
            mapping = (w_i) posterior + (1-w_i) prior
        where
            w_i = n_i t / (s + n_i t)
        
        In the previous equation, t is the target variance in the entire dataset, s is the
        target variance within the category and n is the number of observations for the
        category.
        
        Parameters
        ----------
        smoothing: int, float, 'auto', default=0.0
        """
        super().__init__()
        self.smoothing = smoothing
        self.inputCols = inputCols
        self.labelCol = labelCol
    
    def _fit(self, df):
        """
        Learn the mean value of the target for each category of the variable.
        """

        self.encoder_dict = {}
        inputCols = self.inputCols
        labelCol = self.labelCol
        y_prior = df.select(fn.mean(labelCol).alias("mean")).first()["mean"]
        
        for var in inputCols:
            if self.smoothing == "auto":
                y_var = df.cov(labelCol, labelCol)
                damping = fn.variance(labelCol) / y_var
            else:
                damping = fn.lit(self.smoothing)
            
            groups = df.groupBy(var).agg(
                fn.mean(labelCol).alias("posterior"),
                fn.count("*").alias("counts"),
                damping.alias("damping") 
            ).toPandas().dropna()
            
            groups["lambda"] = groups["counts"] / (groups["counts"] + groups["damping"])
            groups["code"] = (
                groups["lambda"] * groups["posterior"] + 
                    (1.0 - groups["lambda"]) * y_prior
            )
            
            self.encoder_dict[var] = dict(zip(groups[var], groups["code"]))
        return self
    
    def _transform(self, df):
        for var in self.encoder_dict:
            mapping = {k: str(v) for k,v in self.encoder_dict[var].items()}
            df = df.replace(mapping, subset=[var])
            df = df.withColumn(var, df[var].cast('float'))

        print(f'{len(self.encoder_dict):d} columns were mean encoded')
        return df

# replace categories by the mean value of the target for each category.
inputCols = ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']
mean_encoder = MeanEncoder(
    inputCols=inputCols, 
    labelCol='label',
    smoothing='auto'
)
mean_encoder.fit(df).transform(df).select(inputCols).show(5)

2 columns were mean encoded
+---------------+-----------------+
|OCCUPATION_TYPE|ORGANIZATION_TYPE|
+---------------+-----------------+
|    0.062140968|       0.09299603|
|     0.09631742|       0.09449421|
|    0.113258936|       0.10173836|
|           NULL|             NULL|
|           NULL|             NULL|
+---------------+-----------------+
only showing top 5 rows

# The nominative (unordered) categorical features
encoded_cols = ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']
nominal_categories = [col for col in categorical_cols if col not in encoded_cols]

indexedCols = [f"indexed_{col}" for col in nominal_categories]
vectorCols = [f"encoded_{col}" for col in nominal_categories]

onehot_encoder = Pipeline(stages=[
    StringIndexer(
        inputCols=nominal_categories, 
        outputCols=indexedCols,
        handleInvalid='keep'
    ),
    OneHotEncoder(
        inputCols=indexedCols,
        outputCols=vectorCols
    )
])
onehot_encoder.fit(df).transform(df).select(vectorCols).limit(5).toPandas()

# Find the correlation of the positive days since birth and target
df.select((df['DAYS_BIRTH'] / -365).alias('age'), 'label').corr('age', "label")

-0.07823930830982699

sample = df.sample(0.1).select((df['DAYS_BIRTH']/fn.lit(-365)).alias("age"), "label").toPandas()

plt.figure(figsize = (5, 3))
sns.kdeplot(data=sample, x="age", hue="label", common_norm=False)
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')

Text(0.5, 1.0, 'Distribution of Ages')

# Bin the age data
age_binned = pd.cut(sample['age'], bins = np.linspace(20, 70, num = 11))
age_groups  = sample['label'].groupby(age_binned).mean()

plt.figure(figsize = (8, 3))
# Graph the age bins and the average of the target as a bar plot
sns.barplot(x=age_groups.index, y=age_groups*100)
# Plot labeling
plt.xticks(rotation = 30)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group')

Text(0.5, 1.0, 'Failure to Repay by Age Group')

bucketizer = ft.QuantileDiscretizer(
    numBuckets=10,
    handleInvalid='keep',
    inputCols=['DAYS_BIRTH', 'DAYS_EMPLOYED'], 
    outputCols=["buckets1", "buckets2"]
).fit(df)

splits = bucketizer.getSplitsArray() # bin_edges
for c, s in zip(['DAYS_BIRTH', 'DAYS_EMPLOYED'], splits):
    print(f"{c}'s bin_edges:")
    print(s)

[Stage 182:==============================================>       (87 + 9) / 100]

DAYS_BIRTH's bin_edges:
[-inf, -22185.0, -20480.0, -18892.0, -17228.0, -15759.0, -14425.0, -13153.0, -11706.0, -10296.0, inf]
DAYS_EMPLOYED's bin_edges:
[-inf, -5338.0, -3679.0, -2795.0, -2164.0, -1650.0, -1253.0, -922.0, -619.0, -336.0, inf]

dtypes = df.drop("SK_ID_CURR", "TARGET").dtypes
categorical_cols = [k for k, v in dtypes if v == 'string']
numerical_cols = [k for k, v in dtypes if v != 'string']   

def encode(df):
    # The ordinal (ordered) categorical features
    # Pandas calls the categories "levels"
    ordered_levels = {
        "NAME_EDUCATION_TYPE": ["Lower secondary", 
                                "Secondary / secondary special", 
                                "Incomplete higher", 
                                "Higher education"]
    }
    df = ordinal_encode(df, ordered_levels)
    
    # replace categories by the mean value of the target for each category.
    mean_encoder = MeanEncoder(
        inputCols=['OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], 
        labelCol='label',
        smoothing='auto'
    )
    df = mean_encoder.fit(df).transform(df)
    
    # The nominative (unordered) categorical features
    nominal_categories = [col for col in categorical_cols if col not in ordered_levels]
    features_onehot = [col for col in nominal_categories if col not in ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE']]

    indexedCols = [f"indexed_{col}" for col in features_onehot]
    encodedCols = [f"encoded_{col}" for col in features_onehot]

    onehot_encoder = Pipeline(stages=[
        StringIndexer(
            inputCols=features_onehot, 
            outputCols=indexedCols,
            handleInvalid='keep'
        ),
        OneHotEncoder(
            inputCols=indexedCols,
            outputCols=encodedCols
        )
    ])
    
    df = onehot_encoder.fit(df).transform(df).drop(*features_onehot + indexedCols)
    print(f'{len(features_onehot):d} columns were one-hot encoded')
    
    colsMap = dict(zip(encodedCols, features_onehot))
    df = df.withColumnsRenamed(colsMap)
    return df

# Encode categorical features
df_encoded = encode(df)
df_encoded.select(categorical_cols).limit(5).toPandas()

1 columns were ordinal encoded

2 columns were mean encoded

10 columns were one-hot encoded

pd.Series(dict(df_encoded.dtypes)).value_counts()

double    65
int       45
vector    10
float      2
Name: count, dtype: int64

# Function to calculate missing values by column
def display_missing(df, threshold=None, verbose=1):
    n = df.count()
    exprs = [fn.sum(df[col].isNull().cast('int')).alias(col) for col in df.columns]
    missing_number = df.select(*exprs).first().asDict()
    missing_df = pd.DataFrame({
        "missing_number": missing_number.values(),  # Total missing values
        "missing_rate": [value / n for value in missing_number.values()]   # Proportion of missing values
        }, index=missing_number.keys())
    missing_df = missing_df.query("missing_rate>0").sort_values("missing_rate", ascending=False)
    threshold = 0.25 if threshold is None else threshold
    high_missing = missing_df.query(f"missing_rate>{threshold}")
    # Print some summary information
    if verbose:
        print(f"Your selected dataframe has {missing_df.shape[0]} out of {len(df.columns)} columns that have missing values.")
    # Return the dataframe with missing information
    if threshold is None:
        return missing_df
    else:
        if verbose:
            print(f"There are {high_missing.shape[0]} columns with more than {threshold:.1%} missing values.")
        return high_missing

# Missing values statistics
print(display_missing(df_encoded).head(10))

[Stage 206:===================================================>  (96 + 4) / 100]

Your selected dataframe has 66 out of 122 columns that have missing values.
There are 47 columns with more than 25.0% missing values.
                          missing_number  missing_rate
COMMONAREA_MEDI                   214865      0.698723
COMMONAREA_MODE                   214865      0.698723
COMMONAREA_AVG                    214865      0.698723
NONLIVINGAPARTMENTS_MODE          213514      0.694330
NONLIVINGAPARTMENTS_MEDI          213514      0.694330
NONLIVINGAPARTMENTS_AVG           213514      0.694330
LIVINGAPARTMENTS_MODE             210199      0.683550
LIVINGAPARTMENTS_MEDI             210199      0.683550
LIVINGAPARTMENTS_AVG              210199      0.683550
FLOORSMIN_MODE                    208642      0.678486

# Remove variables with high missing rate

def drop_missing_data(df, threshold=0.8):
    # Remove variables with missing more than threshold(default 20%)
    thresh = int(df.count() * (1 - threshold))
    exprs = [fn.sum(df[col].isNull().cast('int')).alias(col) for col in df.columns]
    missing_number = df.select(*exprs).first().asDict()
    cols_to_drop = [k for k,v in missing_number.items() if v > thresh]
    print(f"Removed {len(cols_to_drop)} variables with missing more than {1 - threshold:.1%}")
    return df.drop(*cols_to_drop)

_ = drop_missing_data(df_encoded, threshold=0.2)

[Stage 212:===================================================>  (96 + 4) / 100]

Removed 0 variables with missing more than 80.0%

df_encoded.groupBy(df_encoded['DAYS_EMPLOYED'].isNull()).mean('label').show()

[Stage 215:==================================================>   (94 + 6) / 100]

+-----------------------+-------------------+
|(DAYS_EMPLOYED IS NULL)|         avg(label)|
+-----------------------+-------------------+
|                   true|0.05399646043269404|
|                  false| 0.0865997453765215|
+-----------------------+-------------------+

# Adds a binary variable to flag missing observations.
from pyspark.ml.stat import Correlation, ChiSquareTest

def flag_missing(df, inputCols=None, labelCol='label', alpha=0.05):
    """
    Adds a binary variable to flag missing observations(one indicator per variable). 
    The added variables (missing indicators) are named with the original variable name plus '_missing'.
    
    Parameters:
    ----------
    alpha: float, default=0.05
        Features with correlation more than alpha are selected.
    """
    if inputCols is None:
        inputCols = df.drop(labelCol).columns
    
    for var in inputCols:
        df = df.withColumn(var + "_missing", df[var].isNull().cast('int'))
    
    indicators = [var + "_missing" for var in inputCols]
    # The correlations
    corr = df.select([fn.corr(labelCol, c2).alias(c2) for c2 in indicators])
    corr = corr.fillna(0).first().asDict()
    # find variables for which indicator should be added.
    selected_cols = [var for var, r in corr.items() if abs(r) > alpha]
    drop_cols = [var for var in indicators if var not in selected_cols]
    df = df.drop(*drop_cols)
    print(f"Added {len(selected_cols)} missing indicators")
    return df

print('The number of features:', len(flag_missing(df_encoded).columns))

[Stage 220:>                                                        (0 + 1) / 1]

Added 0 missing indicators
The number of features: 122

pipeline = Pipeline(stages=[
    StringIndexer(
        inputCol="CODE_GENDER", 
        outputCol="indexedCol",
        handleInvalid="keep"
    ),
    OneHotEncoder(
        inputCol="indexedCol", 
        outputCol="encodedCol", 
        handleInvalid="keep",
        dropLast=False
    )
])

pipeline.fit(df).transform(df).select("CODE_GENDER", "encodedCol").show(5)

+-----------+-------------+
|CODE_GENDER|   encodedCol|
+-----------+-------------+
|          M|(4,[1],[1.0])|
|          F|(4,[0],[1.0])|
|          M|(4,[1],[1.0])|
|          F|(4,[0],[1.0])|
|          F|(4,[0],[1.0])|
+-----------+-------------+
only showing top 5 rows

nunique = df_encoded.select([fn.countDistinct(var).alias(var) for var in df_encoded.columns]).first().asDict() 
binary = df_encoded.select([fn.collect_set(var).alias(var) for var,n in nunique.items() if n == 2])
print([k for k, v in binary.first().asDict().items() if set(v) == {0, 1}])

[Stage 231:===================================================>  (96 + 4) / 100]

['label', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

def impute_manually(df):
    """
    Replaces missing values by an arbitrary value
    """
    # boolean
    boolean_features = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 
                        'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 
                        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 
                        'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 
                        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 
                        'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 
                        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
                        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
    df = df.na.fill(0, subset=boolean_features)
    # fill 0
    features_fill_zero = [
        "OBS_30_CNT_SOCIAL_CIRCLE",  
        "DEF_30_CNT_SOCIAL_CIRCLE",
        "OBS_60_CNT_SOCIAL_CIRCLE",
        "DEF_60_CNT_SOCIAL_CIRCLE",
        "AMT_REQ_CREDIT_BUREAU_HOUR",
        "AMT_REQ_CREDIT_BUREAU_DAY",
        "AMT_REQ_CREDIT_BUREAU_WEEK",
        "AMT_REQ_CREDIT_BUREAU_MON",
        "AMT_REQ_CREDIT_BUREAU_QRT",
        "AMT_REQ_CREDIT_BUREAU_YEAR"
    ]
    df = df.na.fill(0, subset=features_fill_zero)
    
    return df

_ = display_missing(impute_manually(df_encoded))

[Stage 246:===================================================>  (96 + 4) / 100]

Your selected dataframe has 55 out of 122 columns that have missing values.
There are 46 columns with more than 25.0% missing values.

print('AMT_CREDIT :', df.corr('AMT_CREDIT', 'AMT_ANNUITY'))
print('AMT_INCOME_TOTAL :', df.corr('AMT_CREDIT', 'AMT_ANNUITY'))

AMT_CREDIT : 0.7700800319525184

[Stage 252:======================================>               (71 + 8) / 100]

AMT_INCOME_TOTAL : 0.7700800319525184

# conditional statistic completer
class ConditionalMeanCompleter:
    pass

# Univariate imputer for completing missing values with simple strategies.

dtypes = df_encoded.drop("SK_ID_CURR", "TARGET").dtypes
numerical_cols = [k for k, v in dtypes if v not in ('string', 'vector')]
imputed_cols = [f"imputed_{col}" for col in numerical_cols]
imputer = ft.Imputer(
    inputCols=numerical_cols,
    outputCols=imputed_cols,
    strategy="median"
)

_ = display_missing(imputer.fit(df_encoded).transform(df_encoded).select(imputed_cols))

[Stage 263:===================================================>  (96 + 4) / 100]

Your selected dataframe has 0 out of 111 columns that have missing values.
There are 0 columns with more than 25.0% missing values.

# Function for missing value imputation

def handle_missing(df):
    # Remove variables with high missing rate
    df = drop_missing_data(df, threshold=0.2)
    # find variables for which indicator should be added.
    df = flag_missing(df)

    # Replaces missing values by an arbitrary value
    df = impute_manually(df)

    # Univariate imputer for completing missing values with simple strategies.
    dtypes = df.drop("SK_ID_CURR", "TARGET").dtypes
    numerical_cols = [k for k, v in dtypes if v not in ('string', 'vector')]
    imputed_cols = [f"imputed_{col}" for col in numerical_cols]
    imputer = ft.Imputer(
        inputCols=numerical_cols,
        outputCols=imputed_cols,
        strategy="median"
    )
    df = imputer.fit(df).transform(df)
    colsMap = dict(zip(imputed_cols, numerical_cols))
    df = df.drop(*numerical_cols).withColumnsRenamed(colsMap)
    return df

df_imputed = handle_missing(df_encoded)

Removed 0 variables with missing more than 80.0%

Added 0 missing indicators

_ = display_missing(df_imputed)

[Stage 292:===================================================>  (96 + 4) / 100]

Your selected dataframe has 0 out of 122 columns that have missing values.
There are 0 columns with more than 25.0% missing values.

class OutlierCapper(Estimator, Transformer):
    """
    Caps maximum and/or minimum values of a variable at automatically
    determined values.
    Works only with numerical variables. A list of variables can be indicated. 
    
    Parameters
    ----------
    method: str, 'gaussian' or 'iqr', default='iqr'
        If method='gaussian': 
            - upper limit: mean + 3 * std
            - lower limit: mean - 3 * std
        If method='iqr': 
            - upper limit: 75th quantile + 3 * IQR
            - lower limit: 25th quantile - 3 * IQR
            where IQR is the inter-quartile range: 75th quantile - 25th quantile.
    fold: int, default=3   
        You can select how far out to cap the maximum or minimum values.
    """

    def __init__(self, inputCols, method='iqr', fold=3):
        super().__init__()
        self.method = method
        self.fold = fold
        self.inputCols = inputCols

    def _fit(self, df):
        """
        Learn the values that should be used to replace outliers.
        """

        if self.method == "gaussian":
            mean = df.select([fn.mean(var).alias(var) for var in self.inputCols])
            mean = pd.Series(mean.first().asDict())
            bias= [mean, mean]
            scale = df.select([fn.std(var).alias(var) for var in self.inputCols])
            scale = pd.Series(scale.first().asDict())
        elif self.method == "iqr":
            Q1 = df.select([fn.percentile(var, 0.25).alias(var) for var in self.inputCols])
            Q1 = pd.Series(Q1.first().asDict())
            Q3 = df.select([fn.percentile(var, 0.75).alias(var) for var in self.inputCols])
            Q3 = pd.Series(Q3.first().asDict())
            bias = [Q1, Q3]
            scale = Q3 - Q1         
        
        # estimate the end values
        if (scale == 0).any():
            raise ValueError(
                f"Input columns {scale[scale == 0].index.tolist()!r}"
                f" have low variation for method {self.method!r}."
                f" Try other capping methods or drop these columns."
            )
        else:
            self.upper_limit = bias[1] + self.fold * scale
            self.lower_limit = bias[0] - self.fold * scale  

        return self 

    def _transform(self, df):
        """
        Cap the variable values.
        """
        maximum = df.select([fn.max(var).alias(var) for var in self.inputCols])
        maximum = pd.Series(maximum.first().asDict())
        minimum = df.select([fn.min(var).alias(var) for var in self.inputCols])
        minimum = pd.Series(minimum.first().asDict())
        outiers = (maximum.gt(self.upper_limit) | 
                   minimum.lt(self.lower_limit))
        n = outiers.sum()
        print(f"Your selected dataframe has {n} out of {len(self.inputCols)} columns that have outliers.")
        
        # replace outliers
        for var in self.inputCols:
            upper_limit = self.upper_limit[var]
            lower_limit = self.lower_limit[var]
            df = df.withColumn(var, 
                fn.when(df[var] > upper_limit, upper_limit)
                  .when(df[var] < lower_limit, lower_limit)
                  .otherwise(df[var])
            )
        return df

outlier_capper = OutlierCapper(method="gaussian", inputCols=numerical_cols).fit(df_imputed)
df_capped = outlier_capper.transform(df_imputed)

Your selected dataframe has 96 out of 111 columns that have outliers.

from pyspark.ml.feature import RobustScaler

scaler = RobustScaler(inputCol="features", outputCol="scaled")
assembler = VectorAssembler(
    inputCols=['DAYS_EMPLOYED', 'AMT_CREDIT'],
    outputCol="features"
)
pipelineModel = Pipeline(stages=[assembler, scaler]).fit(df_imputed)
pipelineModel.transform(df_imputed).select('scaled').show(5)

+--------------------+
|              scaled|
+--------------------+
|[-0.9644030668127...|
|[-0.5991237677984...|
|[-0.6056955093099...|
|[-0.9036144578313...|
|[-0.9036144578313...|
+--------------------+
only showing top 5 rows

# Check the skew of all numerical features
skewness = df_imputed.select([fn.skewness(var).alias(var) for var in numerical_cols])
skewness = pd.Series(skewness.first().asDict()).sort_values()
print(skewness.head(10))
print(skewness.tail(10))

[Stage 310:===================================================>  (96 + 4) / 100]

FLAG_MOBIL                     -554.534039
FLAG_CONT_MOBILE                -23.081060
YEARS_BEGINEXPLUATATION_MEDI    -21.825280
YEARS_BEGINEXPLUATATION_AVG     -21.744660
YEARS_BEGINEXPLUATATION_MODE    -20.686068
DAYS_EMPLOYED                    -2.295700
YEARS_BUILD_MODE                 -1.889130
YEARS_BUILD_MEDI                 -1.747004
YEARS_BUILD_AVG                  -1.744856
FLAG_EMP_PHONE                   -1.664878
dtype: float64
FLAG_DOCUMENT_20              44.364680
FLAG_DOCUMENT_21              54.612673
FLAG_DOCUMENT_17              61.213842
FLAG_DOCUMENT_7               72.173756
FLAG_DOCUMENT_4              110.893823
AMT_REQ_CREDIT_BUREAU_QRT    141.400225
FLAG_DOCUMENT_2              153.791067
FLAG_DOCUMENT_10             209.588031
AMT_INCOME_TOTAL             391.557744
FLAG_DOCUMENT_12             392.112866
dtype: float64

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot, norm

def norm_comparison_plot(series):
    series = pd.Series(series)
    mu, sigma = norm.fit(series)
    kurt, skew = series.kurt(), series.skew()
    print(f"Kurtosis: {kurt:.2f}", f"Skewness: {skew:.2f}", sep='\t')
    
    fig = plt.figure(figsize=(10, 4))
    # Now plot the distribution
    ax1 = fig.add_subplot(121)
    ax1.set_title('Distribution')
    ax1.set_ylabel('Frequency')
    sns.distplot(series, fit=norm, ax=ax1)
    ax1.legend(['dist','kde','norm'],f'Normal dist. ($\mu=$ {mu:.2f} and $\sigma=$ {sigma:.2f} )', loc='best')
    # Get also the QQ-plot
    ax2 = fig.add_subplot(122)
    probplot(series, plot=plt)

sample = df_imputed.select('AMT_CREDIT').sample(0.1).toPandas()
norm_comparison_plot(sample['AMT_CREDIT'])
plt.show()

Kurtosis: 2.06	Skewness: 1.26

# log-transformation of skewed features.
sample_transformed = df_imputed.select(fn.ln('AMT_CREDIT')).sample(0.1).toPandas()
norm_comparison_plot(sample_transformed.iloc[:, 0])
plt.show()

Kurtosis: -0.27	Skewness: -0.33

df_prepared = df_imputed
print(f'dataset shape: {df_prepared.count(), len(df_prepared.columns)}')
print(pd.Series(dict(df_prepared.dtypes)).value_counts())

[Stage 315:===========================================>          (80 + 8) / 100]

dataset shape: (307511, 122)
double    65
int       45
vector    10
float      2
Name: count, dtype: int64

new_colnames = {c: c.replace('/','or').replace(' ','_').replace(',','_or') for c in df_prepared.columns}
df_prepared = df_prepared.withColumnsRenamed(new_colnames)

def cross_val_score(df, estimator, evaluator, features, numFolds=3, seed=SEED):
    df = df.withColumn('fold', (fn.rand(seed) * numFolds).cast('int'))
    eval_result = []
    # Initialize an empty dataframe to hold feature importances
    feature_importances = pd.DataFrame(index=features)
    for i in range(numFolds):
        train = df.filter(df['fold'] == i)
        valid = df.filter(df['fold'] != i)
        model = estimator.fit(train)
        train_pred = model.transform(train)
        valid_pred = model.transform(valid)
        train_score = evaluator.evaluate(train_pred)
        valid_score = evaluator.evaluate(valid_pred)
        metric = evaluator.getMetricName()
        print(f"[{i}] train's {metric}: {train_score},  valid's {metric}: {valid_score}")
        eval_result.append(valid_score)
        
        fscore = model.get_feature_importances()
        fscore = {name:fscore.get(f'f{k}', 0) for k,name in enumerate(features)}
        feature_importances[f'cv_{i}'] = fscore
    feature_importances['fscore'] = feature_importances.mean(axis=1)
    return eval_result, feature_importances.sort_values('fscore', ascending=False)

def score_dataset(df, inputCols=None, featuresCol=None, labelCol='label', nfold=3):
    assert inputCols is not None or featuresCol is not None
    if featuresCol is None:
        # Assemble the feature columns into a single vector column
        featuresCol = "features"
        assembler = VectorAssembler(
            inputCols=inputCols,
            outputCol=featuresCol
        )
        df = assembler.transform(df)
    # Create an Estimator.
    classifier = SparkXGBClassifier(
        features_col=featuresCol, 
        label_col=labelCol,
        eval_metric='auc',
        scale_pos_weight=11,
        learning_rate=0.015,
        max_depth=8,
        subsample=1.0,
        colsample_bytree=0.35,
        reg_alpha=65,
        reg_lambda=15,
        n_estimators=1200,
        verbosity=0
    ) 
    evaluator = BinaryClassificationEvaluator(labelCol=labelCol, metricName='areaUnderROC')
    # Training with 3-fold CV:
    scores, feature_importances = cross_val_score(
        df=df,
        estimator=classifier, 
        evaluator=evaluator,
        features=inputCols,
        numFolds=nfold
    )
    print(f"cv_agg's valid auc: {np.mean(scores):.4f} +/- {np.std(scores):.5f}")
    return feature_importances

features = df_prepared.drop('SK_ID_CURR', 'label').columns
feature_importances = score_dataset(df_prepared, inputCols=features)

2024-06-01 11:38:51,880 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.35, 'device': 'cpu', 'learning_rate': 0.015, 'max_depth': 8, 'reg_alpha': 65, 'reg_lambda': 15, 'scale_pos_weight': 11, 'subsample': 1.0, 'verbosity': 0, 'eval_metric': 'auc', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 1200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[11:39:24] task 0 got new rank 0                                    (0 + 1) / 1]
2024-06-01 11:40:03,876 INFO XGBoost-PySpark: _fit Finished xgboost training!   
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (0 + 8) / 100]
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (1 + 8) / 100]
2024-06-01 11:40:09,675 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,695 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,751 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,758 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,769 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,777 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:09,787 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (9 + 8) / 100]
2024-06-01 11:40:11,844 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,866 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,922 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,923 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,927 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,928 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:11,957 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                (17 + 8) / 100]
2024-06-01 11:40:14,155 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,162 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,183 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,189 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,215 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,215 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:14,238 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,384 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:16,410 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,425 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,431 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,451 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,514 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:16,517 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                (33 + 8) / 100]
2024-06-01 11:40:18,783 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:18,795 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:18,822 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:18,827 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:18,858 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:18,909 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:19,119 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:21,712 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:21,719 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:21,737 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:21,750 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:21,817 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:21,898 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:22,010 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,344 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,345 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:24,376 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,376 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,418 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,704 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:24,767 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,248 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,288 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:27,328 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,328 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,379 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,426 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:27,427 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:29,949 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:30,050 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:30,061 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:30,130 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:30,130 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:30,167 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:30,222 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs=>              (73 + 8) / 100]
2024-06-01 11:40:32,539 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,566 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,579 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,679 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,751 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,803 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:32,903 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,195 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:35,237 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,319 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,341 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,371 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,376 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:35,695 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,704 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:37,751 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,763 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,763 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,772 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,780 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:37,848 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs==============> (97 + 3) / 100]
2024-06-01 11:40:39,031 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:39,035 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,020 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,021 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,044 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,045 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,047 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:46,068 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:46,075 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,524 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,547 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:48,561 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,567 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,583 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,601 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:48,610 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,895 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,905 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,930 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,934 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,941 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:50,955 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:50,987 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,215 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,263 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:53,276 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,288 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,289 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,296 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:53,319 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,752 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,751 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,786 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:55,798 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,819 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,819 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:55,821 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,012 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,046 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:40:58,083 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,098 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,118 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,118 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:40:58,120 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                (48 + 8) / 100]
2024-06-01 11:41:00,361 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,414 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,415 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,416 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,416 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,412 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:00,420 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,655 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,681 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,684 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:41:02,706 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,714 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,728 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:02,745 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,100 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,111 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,128 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,130 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,131 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,133 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:05,148 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:41:07,342 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:41:07,363 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:07,366 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:07,377 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:07,378 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:07,381 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:07,402 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs=====>          (80 + 8) / 100]
2024-06-01 11:41:09,716 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,722 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,737 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,756 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,766 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,768 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:09,770 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs=========>      (88 + 8) / 100]
2024-06-01 11:41:12,061 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,070 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,089 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,090 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,107 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,106 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:12,112 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs=============>  (96 + 4) / 100]
2024-06-01 11:41:13,516 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:13,519 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:41:13,519 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

[0] train's areaUnderROC: 0.8817445932752518,  valid's areaUnderROC: 0.7567778599507636

2024-06-01 11:41:21,034 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.35, 'device': 'cpu', 'learning_rate': 0.015, 'max_depth': 8, 'reg_alpha': 65, 'reg_lambda': 15, 'scale_pos_weight': 11, 'subsample': 1.0, 'verbosity': 0, 'eval_metric': 'auc', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 1200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[11:41:45] task 0 got new rank 0                                    (0 + 1) / 1]
2024-06-01 11:42:23,393 INFO XGBoost-PySpark: _fit Finished xgboost training!   
2024-06-01 11:42:26,570 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:27,877 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:28,600 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,685 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,684 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,714 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,716 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,719 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:28,731 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:29,953 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:30,747 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,784 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,825 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,827 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,842 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,852 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:30,855 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:31,846 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:32,971 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:32,975 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:32,989 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:32,991 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:32,996 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:32,998 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:33,000 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:33,993 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:35,070 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,095 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,117 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,125 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,144 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,141 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,160 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:35,957 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:37,520 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,539 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,590 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,597 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,626 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,627 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,654 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:37,942 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:39,737 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,772 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,780 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,817 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,848 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,849 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,851 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:39,897 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:42,128 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,140 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,153 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,174 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:42,174 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,181 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,193 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:42,198 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,295 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,306 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,386 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,386 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,483 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,487 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:44,510 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:44,510 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,686 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,697 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,753 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,829 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,892 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,895 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:46,933 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:46,971 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:48,903 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:48,960 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:49,104 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:49,113 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:49,205 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:49,227 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:49,284 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:49,300 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,289 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,322 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,572 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,615 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:51,618 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,631 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,646 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:51,679 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,469 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,485 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,789 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,812 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:42:53,838 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,839 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,846 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:53,934 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:54,858 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:42:54,859 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,864 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,864 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,864 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,869 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,871 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,872 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:00,882 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:00,880 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,520 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,529 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,564 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,563 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:03,592 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,598 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,601 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:03,601 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,804 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,837 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,838 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,839 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,842 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:05,865 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,871 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:05,905 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,121 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,119 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,143 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,167 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,176 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,187 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:08,204 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:08,211 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,441 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,492 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,543 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,544 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,546 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:10,549 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,550 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:10,550 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:12,970 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:12,999 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:13,004 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:13,007 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:13,026 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:13,033 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:13,037 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:13,070 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,325 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:15,335 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,353 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,396 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,396 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,398 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,473 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:15,484 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,867 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,869 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,870 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,906 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,906 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,909 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:17,953 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:17,955 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,241 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,247 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:20,248 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,267 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,270 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,272 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,278 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:20,281 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,655 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,656 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,657 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:22,672 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,673 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,674 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,687 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:22,692 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,083 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,096 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:25,114 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,128 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,164 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,165 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,176 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:25,183 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,637 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:27,641 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,641 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,653 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,664 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,690 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,705 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:27,771 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:29,218 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:29,218 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:43:29,230 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:43:29,237 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

[1] train's areaUnderROC: 0.8858137153416724,  valid's areaUnderROC: 0.754088602137405

2024-06-01 11:43:36,723 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.35, 'device': 'cpu', 'learning_rate': 0.015, 'max_depth': 8, 'reg_alpha': 65, 'reg_lambda': 15, 'scale_pos_weight': 11, 'subsample': 1.0, 'verbosity': 0, 'eval_metric': 'auc', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 1200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[11:44:01] task 0 got new rank 0                                    (0 + 1) / 1]
2024-06-01 11:44:39,088 INFO XGBoost-PySpark: _fit Finished xgboost training!   
2024-06-01 11:44:42,412 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:43,747 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:44,445 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,457 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,499 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,508 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,508 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,527 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:44,571 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:45,933 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:46,817 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,843 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,852 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,853 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,883 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,907 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:46,909 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:48,069 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:49,068 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,094 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,179 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,187 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,237 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,250 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:49,279 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:50,453 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:51,386 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,443 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,445 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,446 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,457 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,469 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:51,526 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:52,672 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:53,703 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,709 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,724 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,749 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,771 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,786 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:53,790 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:54,816 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:56,078 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,079 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,084 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,105 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,118 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,120 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,122 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:56,963 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:44:58,347 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,352 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,353 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,370 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,370 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,371 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:58,371 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:44:59,181 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:00,819 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,829 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,845 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,853 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,856 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,860 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:00,881 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:01,778 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:03,143 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,158 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,213 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,215 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,226 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,253 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:03,276 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:04,057 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:05,578 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,602 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,608 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,618 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,621 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,632 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:05,639 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:06,290 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:07,793 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,816 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,818 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,837 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,868 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,871 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:07,873 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:08,370 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:10,034 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,073 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,095 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,116 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,123 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,123 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,155 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:10,451 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:12,666 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:12,690 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,892 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,897 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,908 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,919 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,922 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,921 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:18,930 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:18,961 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,335 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,356 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,358 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,363 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,370 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:21,372 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,375 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:21,378 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,616 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,699 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,736 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,744 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,746 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,747 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:23,755 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:23,761 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,020 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,065 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:26,067 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,078 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,095 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,095 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,097 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:26,125 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,411 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,437 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:28,459 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,464 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,478 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,485 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,491 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:28,497 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,691 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,785 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,823 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:30,848 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,854 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,862 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,878 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:30,883 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:32,936 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:32,957 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:33,052 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:33,053 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:33,103 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:33,124 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:33,126 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:33,141 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,336 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,336 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,337 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:35,356 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,419 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,483 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,487 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:35,488 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:37,608 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:37,927 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:38,005 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:38,077 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:38,168 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:38,323 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:38,346 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:38,503 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,211 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,450 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,451 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:40,763 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,819 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,901 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,926 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:40,934 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:42,685 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:42,859 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:42,862 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:43,395 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:43,395 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:43,419 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:43,429 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:43,432 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,182 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:45,185 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,208 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,763 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,765 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,784 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,790 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:45,823 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:46,934 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:46,937 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-06-01 11:45:46,943 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-06-01 11:45:47,125 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
[Stage 401:======================================>               (72 + 8) / 100]

[2] train's areaUnderROC: 0.8830645318540977,  valid's areaUnderROC: 0.755218312522418
cv_agg's valid auc: 0.7554 +/- 0.00110

df_prepared.write.bucketBy(100, "SK_ID_CURR").mode("overwrite").saveAsTable("home_credit_default_risk.prepared_data")

feature_importances['fscore'].head(15)

NONLIVINGAPARTMENTS_MEDI        4420.333333
NONLIVINGAREA_MEDI              4300.666667
YEARS_BEGINEXPLUATATION_MODE    4240.000000
COMMONAREA_MODE                 4098.666667
ELEVATORS_MODE                  4023.666667
NONLIVINGAPARTMENTS_AVG         3947.000000
LIVINGAREA_AVG                  3862.666667
YEARS_BUILD_MODE                3781.000000
NONLIVINGAREA_AVG               3455.333333
LIVINGAREA_MEDI                 3313.666667
BASEMENTAREA_MODE               3160.666667
LIVINGAPARTMENTS_AVG            2819.333333
LIVINGAPARTMENTS_MEDI           2635.000000
YEARS_BUILD_MEDI                2312.666667
ENTRANCES_MODE                  1947.666667
Name: fscore, dtype: float64

spark.stop()

	encoded_NAME_CONTRACT_TYPE	encoded_CODE_GENDER	encoded_FLAG_OWN_CAR	encoded_FLAG_OWN_REALTY	encoded_NAME_TYPE_SUITE	encoded_NAME_INCOME_TYPE	encoded_NAME_FAMILY_STATUS	encoded_NAME_HOUSING_TYPE	encoded_WEEKDAY_APPR_PROCESS_START	encoded_FONDKAPREMONT_MODE	encoded_HOUSETYPE_MODE	encoded_WALLSMATERIAL_MODE	encoded_EMERGENCYSTATE_MODE
0	(0.0, 1.0)	(0.0, 1.0)	(0.0, 1.0)	(1.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 1.0)	(1.0, 0.0, 0.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0)
1	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0)
2	(1.0, 0.0)	(0.0, 1.0)	(0.0, 1.0)	(1.0, 0.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0)
3	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0)	(0.0, 1.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0)
4	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0)

	NAME_CONTRACT_TYPE	CODE_GENDER	NAME_TYPE_SUITE	NAME_INCOME_TYPE	NAME_EDUCATION_TYPE	NAME_FAMILY_STATUS	NAME_HOUSING_TYPE	OCCUPATION_TYPE	WEEKDAY_APPR_PROCESS_START	ORGANIZATION_TYPE	FONDKAPREMONT_MODE	HOUSETYPE_MODE	WALLSMATERIAL_MODE
0	(0.0, 1.0)	(0.0, 1.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	3	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	0.062141	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	0.092996	(0.0, 0.0, 0.0, 1.0)	(1.0, 0.0, 0.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)
1	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	2	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	0.096317	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	0.094494	(1.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)
2	(1.0, 0.0)	(0.0, 1.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	1	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	0.113259	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)	0.101738	(0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
3	(1.0, 0.0)	(1.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	1	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	NaN	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)	NaN	(0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
4	(1.0, 0.0)	(1.0, 0.0)	(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	1	(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)	(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)	NaN	(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)	NaN	(0.0, 0.0, 0.0, 0.0)	(0.0, 0.0, 0.0)	(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

pyspark.ml.feature	标准化
StandardScaler(withMean, withStd, …)	是一个`Estimator`。z-scoe标准化
Normalizer(p, inputCol, outputCol)	是一个`Transformer`。该方法使用p范数将数据缩放为单位范数（默认为L2）
MaxAbsScaler(inputCol, outputCol)	是一个`Estimator`。将数据标准化到`[-1, 1]`范围内
MinMaxScaler(min, max, inputCol, outputCol)	是一个`Estimator`。将数据标准化到`[0, 1]`范围内
RobustScaler(lower, upper, …)	是一个`Estimator`。根据分位数缩放数据

数据预处理¶

探索性数据分析¶

数据清洗¶

数据去重¶

数据类型转换¶

错误数据清洗¶

布尔特征清洗¶

函数封装¶

特征重编码¶

顺序编码¶

平均数编码¶

哑变量编码¶

连续特征分箱¶

函数封装¶

缺失值处理¶

缺失值统计¶

缺失值删除¶

缺失值标记¶

人工插补¶

条件平均值填充法¶

简单插补¶

函数封装¶

异常值检测¶

标准化/归一化¶

正态变换¶

偏度¶

QQ图¶

非线性变换¶

Baseline¶

交叉验证¶

特征重要性¶

	SK_ID_CURR	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	...	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	191480	Cash loans	M	Y	N	0	157500.0	342000.0	17590.5	...	1.0	0.0	7.0
1	191502	Cash loans	F	N	Y	0	108000.0	324000.0	20704.5	...	0.0	0.0	0.0
2	191673	Cash loans	F	Y	Y	0	135000.0	1323000.0	36513.0	...	1.0	0.0	2.0
3	191877	Cash loans	F	N	Y	2	45000.0	47970.0	5296.5	...	0.0	0.0	4.0
4	192108	Cash loans	F	N	Y	0	315000.0	263686.5	13522.5	...	5.0	2.0	3.0

	summary	SK_ID_CURR	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	...	FLAG_DOCUMENT_18	FLAG_DOCUMENT_19	FLAG_DOCUMENT_20	FLAG_DOCUMENT_21	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	count	307511	307511	307511	307511	307511	307511	307511	307511	307511	...	307511	307511	307511	307511	265992	265992	265992	265992	265992	265992
1	mean	278180.51857657125	0.08072881945686496	None	None	None	None	0.4170517477423572	168797.91929698447	599025.9997057016	...	0.008129790479039774	5.951006630657115E-4	5.072989258920819E-4	3.349473677364387E-4	0.006402448193930645	0.0070002105326475985	0.0343619356973142	0.26739526000781977	0.26547414959848414	1.899974435321363
2	stddev	102790.17534842461	0.2724186456483938	None	None	None	None	0.722121384437625	237123.14627885612	402490.776995855	...	0.0897982361093956	0.024387465065862264	0.022517620268446132	0.01829853182243764	0.08384912844747726	0.11075740632435459	0.20468487581282443	0.9160023961526171	0.7940556483207575	1.8692949981815559
3	min	100002	0	Cash loans	F	N	N	0	25650.0	45000.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
4	25%	189124	0	None	None	None	None	0	112500.0	270000.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
5	50%	278173	0	None	None	None	None	0	146250.0	513531.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	1.0
6	75%	367118	0	None	None	None	None	1	202500.0	808650.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	3.0
7	max	456255	1	Revolving loans	XNA	Y	Y	19	1.17E8	4050000.0	...	1	1	1	1	4.0	9.0	8.0	27.0	261.0	25.0