Decision Tree, Random Forest and Extra Tree

2023. 5. 9. 17:54

Decision Tree

Overview

Artificial Intelligence Learning Point: the decision rules along each path from the root to a leaf node

import pandas as pd
from sklearn.datasets import make_classification, make_regression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
import graphviz

# Regression demo: a depth-limited tree fitted on synthetic regression data.
X, y = make_regression(
    n_samples=500, n_features=10, n_informative=5, n_targets=1,
    bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0,
    shuffle=True, coef=False, random_state=None,
)
model = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=3,
    min_samples_split=200,
    min_samples_leaf=100,
).fit(X, y)
model.predict(X)

# Classification demo: three imbalanced classes, gini criterion.
X, y = make_classification(
    n_samples=500, n_features=10, n_classes=3, n_clusters_per_class=1,
    weights=[.6, .3, .1], flip_y=0,
)
model = DecisionTreeClassifier(criterion='gini', min_impurity_decrease=0.01).fit(X, y)
model.predict(X)
model.predict_proba(X)

# Render the fitted classifier as a Graphviz diagram.
feature_names = [f'X{i}' for i in range(X.shape[1])]
# export_graphviz expects class names in ascending class order, i.e. the order of
# model.classes_; pd.unique(y) lists labels in order of first appearance instead,
# which can attach the wrong name to a node.
class_names = model.classes_.astype(str).tolist()
dot_data = export_graphviz(model, out_file=None, filled=True, rounded=True, special_characters=True,
                           feature_names=feature_names,
                           class_names=class_names)
display(graphviz.Source(dot_data))

Entropy and Information Gain

Entropy: $${\displaystyle \mathrm {H} (T)=\operatorname {I} _{E}\left(p_{1},p_{2},\ldots ,p_{J}\right)=-\sum _{i=1}^{J}p_{i}\log _{2}p_{i}}$$ Information Gain: $${\displaystyle \overbrace {IG(T,a)} ^{\text{information gain}}=\overbrace {\mathrm {H} (T)} ^{\text{entropy (parent)}}-\overbrace {\mathrm {H} (T\mid a)} ^{\text{sum of entropies (children)}}}$$ $${\displaystyle =-\sum _{i=1}^{J}p_{i}\log _{2}p_{i}-\sum _{i=1}^{J}-\Pr(i\mid a)\log _{2}\Pr(i\mid a)}$$ Expected Information Gain: $${\displaystyle \overbrace {E_{A}(\operatorname {IG} (T,a))} ^{\text{expected information gain}}=\overbrace {I(T;A)} ^{{\text{mutual information between }}T{\text{ and }}A}=\overbrace {\mathrm {H} (T)} ^{\text{entropy (parent)}}-\overbrace {\mathrm {H} (T\mid A)} ^{\text{weighted sum of entropies (children)}}}$$ $${\displaystyle =-\sum _{i=1}^{J}p_{i}\log _{2}p_{i}-\sum _{a}p(a)\sum _{i=1}^{J}-\Pr(i\mid a)\log _{2}\Pr(i\mid a)}$$ $${\displaystyle {\mathrm {H} (T\mid A)}=\sum _{a}p(a)\sum _{i=1}^{J}-\Pr(i\mid a)\log _{2}\Pr(i\mid a)}$$

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
np.random.seed(0)  # seed NumPy's global RNG so the np.random.randint demo frame built below is reproducible

def decision_entropy(frame, target='Y'):
    """Return the base-2 Shannon entropy of the labels in ``frame[target]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the label column.
    target : str, default 'Y'
        Name of the label column.

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` over the relative frequency ``p_i`` of every
        label that occurs in the column.
    """
    # value_counts only reports labels that occur, so log2 never sees a zero.
    probability = frame[target].value_counts(normalize=True)
    entropy = (-probability * np.log2(probability)).sum()
    return entropy

def child_entropy(frame, column):
    """Return the weighted entropy of the subsets obtained by grouping on ``column``.

    Each distinct value of ``frame[column]`` defines one subset; its entropy
    (from ``decision_entropy``) is weighted by the share of rows holding that
    value, and the weighted entropies are summed.
    """
    values = frame[column]
    weights = values.value_counts() / values.value_counts().sum()
    per_value = pd.Series({
        value: decision_entropy(frame.loc[values == value, :])
        for value in values.unique()
    })
    # Multiplication aligns the two Series on the distinct values.
    return (weights * per_value).sum()

# Synthetic demo frame: three integer features drawn from [0, 10) and an integer target Y from [0, 4).
df = pd.DataFrame(data=np.c_[np.random.randint(0, 10, size=(100,3)), np.random.randint(0, 4, size=(100,1))], columns=['X0', 'X1', 'X2', 'Y'])

# Information gain of a feature = parent entropy - weighted entropy of its children.
parent_entropy = decision_entropy(df)
children_entropy = {column: child_entropy(df, column) for column in ['X0', 'X1', 'X2']}
information_gain = pd.Series({column: parent_entropy - entropy for column, entropy in children_entropy.items()})
# idxmax returns the first best label even when features tie; filtering on
# rank == 1 comes back empty in that case because tied entries share an averaged rank.
spliter_column = information_gain.idxmax()

# Within the chosen feature, pick the value whose subset has the lowest entropy.
split_values = df[spliter_column]
probability = split_values.value_counts() / split_values.value_counts().sum()
entropies = pd.Series({instance: decision_entropy(df.loc[split_values == instance, :]) for instance in split_values.unique()})
spliter_instance = entropies.idxmin()

# Equality split: rows holding the chosen value versus all remaining rows.
df_left = df.loc[split_values == spliter_instance]
df_right = df.loc[split_values != spliter_instance]


# Fit an entropy-criterion tree on the same frame to compare its node impurities with the manual values.
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(df[['X0', 'X1', 'X2']], df['Y'])

decision_entropy(df)       # classifier.tree_.impurity[0]  (entropy of all rows; expected to equal the root impurity)
decision_entropy(df_left)  # classifier.tree_.impurity[1]  NOTE(review): df_left is an equality split (== spliter_instance); presumably this matches only if the tree's first split isolates the same rows -- verify
decision_entropy(df_right)  # entropy of the remaining rows (!= spliter_instance)
classifier.tree_.impurity  # impurity values stored by the fitted tree, for comparison with the values above

INDEX	X0	X1	X2	Y
0	5	0	3	0
1	3	7	9	2
2	3	5	2	2
3	4	7	6	2
4	8	8	1	0
...	...	...	...	...
95	8	8	7	0
96	0	3	8	0
97	7	7	1	0
98	8	4	7	0
99	0	4	9	1

Task: Classification

Validation: DecisionTreeClassifier: binary classification

import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.tree import DecisionTreeClassifier


# Synthetic binary problem with a 60/40 class balance.
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control (pass an int random_state for repeatable folds)

#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), DecisionTreeClassifier())
classifier = GridSearchCV(
    estimator=classifier, cv=cv,
    scoring=['accuracy', 'recall', 'precision', 'f1'][0],
    param_grid={
        # Split-quality criteria belong under 'criterion'. Listing them under a second
        # 'splitter' key would be silently overwritten by the entry below, so they
        # would never be searched.
        'decisiontreeclassifier__criterion':["gini", "entropy", "log_loss"],
        'decisiontreeclassifier__splitter':["best", "random"],
#        'decisiontreeclassifier__max_depth':[None, 5, 10, 20],
#        'decisiontreeclassifier__min_samples_split':[2, 5, 10, 20],
        'decisiontreeclassifier__min_samples_leaf':[1, 5, 10, 20],
        'decisiontreeclassifier__min_weight_fraction_leaf':[0, .1],
#        'decisiontreeclassifier__max_features':["auto", "sqrt", "log2"],
#        'decisiontreeclassifier__random_state':[None, 0],
#        'decisiontreeclassifier__max_leaf_nodes':[None],
#        'decisiontreeclassifier__min_impurity_decrease':[0, .1],
#        'decisiontreeclassifier__ccp_alpha':[0, .1],
    },
    return_train_score=True
)
# Fit, persist the fitted search object, then reload it from disk.
classifier.fit(X, y)
joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_


# Evaluation
# Build one table per cv_results_ key family (train scores, test scores, timings);
# every table has one row per parameter combination.
cv_results = classifier.cv_results_
tables = []
for family in ('train_score', 'test_score', 'time'):
    matched = pd.DataFrame([(key, values) for key, values in cv_results.items() if key.endswith(family)])
    table = matched[1].apply(pd.Series).T.rename(columns=matched[0].to_dict())
    table.columns = pd.MultiIndex.from_tuples([(family, name.replace('_' + family, '')) for name in table.columns])
    tables.append(table)
train_scores, test_scores, time_scores = tables

# Join the tables side by side and index the rows by their parameter values.
scores = pd.concat(tables, axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(cv_results['params']))
scores.sort_values(('test_score', 'rank'))

Preprocessing effect: DecisionTreeClassifier: binary classification

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer

def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
    """Cross-validate ``classifier`` on ``(X, y)`` and return the mean of every result column.

    Parameters
    ----------
    classifier : estimator
        Passed unchanged to ``cross_validate``.
    X, y : array-like
        Features and targets, passed unchanged to ``cross_validate``.
    preprocessor_name : str
        Stored as the ``name`` of the returned Series.
    task_type : {'binary', 'multi', 'reg'}
        Selects the metric list and the cross-validation splitter.
    random_state : int or None, default None
        Forwarded to the repeated K-fold splitter.

    Returns
    -------
    pandas.Series
        Column-wise mean of the ``cross_validate`` output (timings plus train
        and test value of every metric), named ``preprocessor_name``.

    Raises
    ------
    ValueError
        If ``task_type`` is not one of the supported values.
    """
    metrics_by_task = {
        'binary': ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo'],
        'multi': ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted'],
        'reg': ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error'],
    }
    # Fail early with a clear message instead of hitting an unbound local further down.
    if task_type not in metrics_by_task:
        raise ValueError(f"task_type must be one of {sorted(metrics_by_task)}, got {task_type!r}")

    from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold

    # Classification tasks use stratified folds; regression uses plain K-fold.
    splitter = RepeatedKFold if task_type == 'reg' else RepeatedStratifiedKFold
    cv = splitter(n_splits=5, n_repeats=3, random_state=random_state)

    scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=metrics_by_task[task_type], return_train_score=True)).mean()
    scores.name = preprocessor_name
    return scores

def scoring_summary(scores):
    """Regroup flat score columns into a two-level ``(metric, split)`` column index.

    Columns starting with ``train``/``test`` and columns ending with ``time``
    are selected (in that order), relabelled as ``(split, metric)`` pairs,
    concatenated side by side, and the two column levels are swapped so the
    metric name comes first. The input frame is not modified.
    """
    train_part = scores[[name for name in scores.columns if name.startswith('train')]]
    test_part = scores[[name for name in scores.columns if name.startswith('test')]]
    time_part = scores[[name for name in scores.columns if name.endswith('time')]]

    # 'train_f1_macro' -> ('train', 'f1_macro'); 'fit_time' -> ('time', 'fit')
    train_part.columns = pd.MultiIndex.from_tuples([('train', name.partition('_')[2]) for name in train_part.columns])
    test_part.columns = pd.MultiIndex.from_tuples([('test', name.partition('_')[2]) for name in test_part.columns])
    time_part.columns = pd.MultiIndex.from_tuples([('time', name.rpartition('_')[0]) for name in time_part.columns])

    combined = pd.concat([train_part, test_part, time_part], axis=1)
    return combined.swaplevel(0, 1, axis=1)

from sklearn.pipeline import make_pipeline  # imported here because this cell's import list does not include it

random_state = None
task_type = 'binary'
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0, random_state=random_state)

# (label, transformer) pairs. Each transformer is wrapped in a pipeline so it is
# fitted on the training part of every fold only; transforming all of X up front
# would let statistics of the held-out rows leak into training.
preprocessors = [
    # transform of measure
    ('PCA', PCA(n_components=None)),
    ('FactorAnalysis', FactorAnalysis(n_components=None, rotation='varimax')),
    ('QuantileTransformer', QuantileTransformer(output_distribution='normal')),
    ('PowerTransform', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('Normalizer', Normalizer()),
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('MaxAbsScaler', MaxAbsScaler()),
    ('RobustScaler', RobustScaler()),
    ('KBinsDiscretizer', KBinsDiscretizer(n_bins=[3]*X.shape[1], encode='ordinal')),
    # transform of sigma-algebra
    ('Binarizer', Binarizer(threshold=0)),
    ('PolynomialFeatures', PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)),
    ('SplineTransformer', SplineTransformer(degree=2, n_knots=3)),
]

# baseline
scores = [scoring(DecisionTreeClassifier(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state)]
for preprocessor_name, preprocessor in preprocessors:
    candidate = make_pipeline(preprocessor, DecisionTreeClassifier())
    scores.append(scoring(candidate, X, y, preprocessor_name=preprocessor_name, task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T

# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]

Task: Regression

Validation: DecisionTreeRegressor

# https://scikit-learn.org/stable/modules/model_evaluation.html

import joblib
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor

# Synthetic regression problem: 10 features, 5 of them informative, no noise.
X, y = make_regression(
    n_samples=3000, n_features=10, n_informative=5, n_targets=1,
    bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0,
    shuffle=True, coef=False, random_state=None,
)

cv = KFold(n_splits=10, shuffle=False, random_state=None)
regression_metrics = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
search_space = {
    'decisiontreeregressor__splitter':["best", "random"],
#    'decisiontreeregressor__max_depth':[None, 5, 10, 20],
#    'decisiontreeregressor__min_samples_split':[2, 5, 10, 20],
#    'decisiontreeregressor__min_samples_leaf':[1, 5, 10, 20],
#    'decisiontreeregressor__min_weight_fraction_leaf':[0, .1],
#    'decisiontreeregressor__max_features':["auto", "sqrt", "log2"],
#    'decisiontreeregressor__random_state':[None, 0],
#    'decisiontreeregressor__max_leaf_nodes':[None],
#    'decisiontreeregressor__min_impurity_decrease':[0, .1],
#    'decisiontreeregressor__ccp_alpha':[0, .1],
}
pipeline = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), DecisionTreeRegressor())
regressor = GridSearchCV(
    estimator=pipeline,
    cv=cv,
    scoring=regression_metrics[1],
    param_grid=search_space,
    return_train_score=True,
)
# Fit, persist the fitted search object, then reload it from disk.
regressor.fit(X, y)
joblib.dump(regressor, 'regressor.joblib')
regressor = joblib.load('regressor.joblib')

# Evaluation
# Build one table per cv_results_ key family (train scores, test scores, timings);
# every table has one row per parameter combination.
cv_results = regressor.cv_results_
tables = []
for family in ('train_score', 'test_score', 'time'):
    matched = pd.DataFrame([(key, values) for key, values in cv_results.items() if key.endswith(family)])
    table = matched[1].apply(pd.Series).T.rename(columns=matched[0].to_dict())
    table.columns = pd.MultiIndex.from_tuples([(family, name.replace('_' + family, '')) for name in table.columns])
    tables.append(table)
train_scores, test_scores, time_scores = tables

# Join the tables side by side and index the rows by their parameter values.
scores = pd.concat(tables, axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(cv_results['params']))
scores.sort_values(('test_score', 'rank'))

Preprocessing effect: DecisionTreeRegressor: regression

Random Forest

Overview

from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Classification demo: three imbalanced classes, 10-tree forest.
X, y = make_classification(
    n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1,
    weights=[.6, .3, .1], flip_y=0, random_state=None,
)
classifier = RandomForestClassifier(
    n_estimators=10, max_features='sqrt', criterion='gini', random_state=0,
).fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)

# Regression demo: same forest size with the squared-error criterion.
X, y = make_regression(
    n_samples=3000, n_features=10, n_informative=5, n_targets=1,
    bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0,
    shuffle=True, coef=False, random_state=None,
)
regressor = RandomForestRegressor(
    n_estimators=10, max_features='sqrt', criterion='squared_error', random_state=0,
).fit(X, y)
regressor.predict(X)

Task: Classification

Validation: RandomForestClassifier: binary classification

import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier


# Synthetic binary problem with a 60/40 class balance.
X, y = make_classification(
    n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1,
    weights=[0.6, 0.4], flip_y=0,
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control

#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
candidate_metrics = ['accuracy', 'recall', 'precision', 'f1']
search_space = {
    'randomforestclassifier__n_estimators':[100],
    'randomforestclassifier__criterion':["gini", "entropy", "log_loss"],
    'randomforestclassifier__max_depth':[None, 5, 10, 20],
    'randomforestclassifier__min_samples_split':[2, 5, 10],
    'randomforestclassifier__min_samples_leaf':[1, 10, 20],
    'randomforestclassifier__min_weight_fraction_leaf':[0.0, .1],
#    'randomforestclassifier__max_features':["sqrt", "log2", None],
#    'randomforestclassifier__max_leaf_nodes':[None],
#    'randomforestclassifier__min_impurity_decrease':[0.0, .1],
#    'randomforestclassifier__bootstrap':[True],
#    'randomforestclassifier__oob_score':[False],
#    'randomforestclassifier__n_jobs':[None],
#    'randomforestclassifier__random_state':[None],
#    'randomforestclassifier__warm_start':[False],
#    'randomforestclassifier__class_weight':[None],
#    'randomforestclassifier__ccp_alpha':[0.0],
#    'randomforestclassifier__max_samples':[None],
}
pipeline = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), RandomForestClassifier())
classifier = GridSearchCV(
    estimator=pipeline,
    cv=cv,
    scoring=candidate_metrics[0],
    param_grid=search_space,
    return_train_score=True,
)
# Fit, persist the fitted search object, then reload it from disk.
classifier.fit(X, y)
joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_


# Evaluation
# Build one table per cv_results_ key family (train scores, test scores, timings);
# every table has one row per parameter combination.
cv_results = classifier.cv_results_
tables = []
for family in ('train_score', 'test_score', 'time'):
    matched = pd.DataFrame([(key, values) for key, values in cv_results.items() if key.endswith(family)])
    table = matched[1].apply(pd.Series).T.rename(columns=matched[0].to_dict())
    table.columns = pd.MultiIndex.from_tuples([(family, name.replace('_' + family, '')) for name in table.columns])
    tables.append(table)
train_scores, test_scores, time_scores = tables

# Join the tables side by side and index the rows by their parameter values.
scores = pd.concat(tables, axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(cv_results['params']))
scores.sort_values(('test_score', 'rank'))

Preprocessing effect: RandomForestClassifier: binary classification

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer

def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
    """Cross-validate ``classifier`` on ``(X, y)`` and return the mean of every result column.

    Parameters
    ----------
    classifier : estimator
        Passed unchanged to ``cross_validate``.
    X, y : array-like
        Features and targets, passed unchanged to ``cross_validate``.
    preprocessor_name : str
        Stored as the ``name`` of the returned Series.
    task_type : {'binary', 'multi', 'reg'}
        Selects the metric list and the cross-validation splitter.
    random_state : int or None, default None
        Forwarded to the repeated K-fold splitter.

    Returns
    -------
    pandas.Series
        Column-wise mean of the ``cross_validate`` output (timings plus train
        and test value of every metric), named ``preprocessor_name``.

    Raises
    ------
    ValueError
        If ``task_type`` is not one of the supported values.
    """
    metrics_by_task = {
        'binary': ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo'],
        'multi': ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted'],
        'reg': ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error'],
    }
    # Fail early with a clear message instead of hitting an unbound local further down.
    if task_type not in metrics_by_task:
        raise ValueError(f"task_type must be one of {sorted(metrics_by_task)}, got {task_type!r}")

    from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold

    # Classification tasks use stratified folds; regression uses plain K-fold.
    splitter = RepeatedKFold if task_type == 'reg' else RepeatedStratifiedKFold
    cv = splitter(n_splits=5, n_repeats=3, random_state=random_state)

    scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=metrics_by_task[task_type], return_train_score=True)).mean()
    scores.name = preprocessor_name
    return scores

def scoring_summary(scores):
    """Regroup flat score columns into a two-level ``(metric, split)`` column index.

    Columns starting with ``train``/``test`` and columns ending with ``time``
    are selected (in that order), relabelled as ``(split, metric)`` pairs,
    concatenated side by side, and the two column levels are swapped so the
    metric name comes first. The input frame is not modified.
    """
    train_part = scores[[name for name in scores.columns if name.startswith('train')]]
    test_part = scores[[name for name in scores.columns if name.startswith('test')]]
    time_part = scores[[name for name in scores.columns if name.endswith('time')]]

    # 'train_f1_macro' -> ('train', 'f1_macro'); 'fit_time' -> ('time', 'fit')
    train_part.columns = pd.MultiIndex.from_tuples([('train', name.partition('_')[2]) for name in train_part.columns])
    test_part.columns = pd.MultiIndex.from_tuples([('test', name.partition('_')[2]) for name in test_part.columns])
    time_part.columns = pd.MultiIndex.from_tuples([('time', name.rpartition('_')[0]) for name in time_part.columns])

    combined = pd.concat([train_part, test_part, time_part], axis=1)
    return combined.swaplevel(0, 1, axis=1)

from sklearn.pipeline import make_pipeline  # imported here because this cell's import list does not include it

random_state = None
task_type = 'binary'
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.9, 0.1], flip_y=0, random_state=random_state)

params = dict(n_estimators=10, max_features='sqrt', criterion='gini', random_state=random_state)

# (label, transformer) pairs. Each transformer is wrapped in a pipeline so it is
# fitted on the training part of every fold only; transforming all of X up front
# would let statistics of the held-out rows leak into training.
preprocessors = [
    # transform of measure
    ('PCA', PCA(n_components=None)),
    ('FactorAnalysis', FactorAnalysis(n_components=None, rotation='varimax')),
    ('QuantileTransformer', QuantileTransformer(output_distribution='normal')),
    ('PowerTransform', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('Normalizer', Normalizer()),
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('MaxAbsScaler', MaxAbsScaler()),
    ('RobustScaler', RobustScaler()),
    ('KBinsDiscretizer', KBinsDiscretizer(n_bins=[3]*X.shape[1], encode='ordinal')),
    # transform of sigma-algebra
    ('Binarizer', Binarizer(threshold=0)),
    ('PolynomialFeatures', PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)),
    ('SplineTransformer', SplineTransformer(degree=2, n_knots=3)),
]

# baseline
scores = [scoring(RandomForestClassifier(**params), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state)]
for preprocessor_name, preprocessor in preprocessors:
    candidate = make_pipeline(preprocessor, RandomForestClassifier(**params))
    scores.append(scoring(candidate, X, y, preprocessor_name=preprocessor_name, task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T

# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]

Task: Regression

Validation: RandomForestRegressor

Extra Tree

Overview

from sklearn.datasets import make_classification, make_regression
from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor

# Classification demo: three imbalanced classes, single extra tree with the gini criterion.
X, y = make_classification(
    n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1,
    weights=[.6, .3, .1], flip_y=0,
)
classifier = ExtraTreeClassifier(criterion='gini').fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)

# Regression demo: single extra tree with the squared-error criterion.
X, y = make_regression(
    n_samples=3000, n_features=10, n_informative=5, n_targets=1,
    bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0,
    shuffle=True, coef=False, random_state=None,
)
regressor = ExtraTreeRegressor(criterion='squared_error').fit(X, y)
regressor.predict(X)

Task: Classification

Validation: ExtraTreeClassifier: binary classification

import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.tree import ExtraTreeClassifier


# Synthetic binary problem with a 60/40 class balance.
X, y = make_classification(
    n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1,
    weights=[0.6, 0.4], flip_y=0,
)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None) # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control

#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
candidate_metrics = ['accuracy', 'recall', 'precision', 'f1']
search_space = {
    'extratreeclassifier__criterion':["gini", "entropy", "log_loss"],
    'extratreeclassifier__splitter':['random', 'best'],
    'extratreeclassifier__min_samples_split':[2, 10, 30, 100],
    'extratreeclassifier__min_samples_leaf':[1, 10, 30],
    'extratreeclassifier__min_weight_fraction_leaf':[0.0],
    'extratreeclassifier__max_features':['sqrt', 'log2'],
    'extratreeclassifier__max_depth':[10, 20, 30],
    'extratreeclassifier__min_impurity_decrease':[.01, .05, .1],
}
pipeline = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), ExtraTreeClassifier(max_leaf_nodes=None, random_state=None))
classifier = GridSearchCV(
    estimator=pipeline,
    cv=cv,
    scoring=candidate_metrics[0],
    param_grid=search_space,
    return_train_score=True,
)
# Fit, persist the fitted search object, then reload it from disk.
classifier.fit(X, y)
joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_


# Evaluation
# Build one table per cv_results_ key family (train scores, test scores, timings);
# every table has one row per parameter combination.
cv_results = classifier.cv_results_
tables = []
for family in ('train_score', 'test_score', 'time'):
    matched = pd.DataFrame([(key, values) for key, values in cv_results.items() if key.endswith(family)])
    table = matched[1].apply(pd.Series).T.rename(columns=matched[0].to_dict())
    table.columns = pd.MultiIndex.from_tuples([(family, name.replace('_' + family, '')) for name in table.columns])
    tables.append(table)
train_scores, test_scores, time_scores = tables

# Join the tables side by side and index the rows by their parameter values.
scores = pd.concat(tables, axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(cv_results['params']))
scores.sort_values(('test_score', 'rank'))

Task: Regression

Validation: ExtraTreeRegressor

Reference

'artificial intelligence > machine learning' 카테고리의 다른 글

SVM Model (0)	2023.05.09
K-Nearest and Radius Neighbor Model (0)	2023.05.09
Ensemble Model (0)	2023.05.09
Naive Bayes Classifier and Discriminant Model (0)	2023.05.09
Linear Model (0)	2023.05.09

All-Together