Naive Bayes Classifier and Discriminant Model
Naive Bayes Classifier
Artificial Intelligence Learning Point: Group-wise Mean and Variance of Data
Bernoulli Naive Bayes Classifier
- Assumptions: conditional independence; naive assumption
For conditionally independent binary (Boolean) attributes on explanatory variable X and categorical response variable y,
(WIKIPEDIA) In the multivariate Bernoulli event model, features are independent Booleans (binary variables) describing inputs.
$${\displaystyle \textit{objective probabilistic model: } \quad p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}} = {\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{ \sum_{k} p(\mathbf {x} \mid C_{k}) p(C_{k})}} }$$ $${\displaystyle p(\mathbf {x} \mid C_{k})=\prod _{i=1}^{n}p_{ki}^{x_{i}}(1-p_{ki})^{(1-x_{i})}}$$
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
np.random.seed(10)
df = pd.DataFrame(data=np.c_[np.random.randint(2, size=(100, 3)), np.random.randint(2, size=(100, 1))], columns=['X0', 'X1', 'X2', 'Y'])
df.loc[df.index[(df['X0'] == 0)|(df['X1'] == 0)|(df['X2'] == 0)].to_series().sample(frac=.8).index, ['Y']] = 1 # dependence
model = BernoulliNB(class_prior=[.5, .5]) # default: class_prior=None > model.class_count_/model.class_count_.sum()
model.fit(df[['X0', 'X1', 'X2']], df['Y'])
# key points
model.class_count_ # df['Y'].value_counts()
model.feature_count_ # number of non-zero by Y class: pd.concat([df[['X0', 'X1', 'X2']].applymap(lambda x: 0 if x == 0 else 1), df[['Y']]], axis=1).groupby(['Y']).sum()
model.feature_log_prob_ # np.exp(model.feature_log_prob_)
target_features = df[['X0', 'X1', 'X2']].drop_duplicates().reset_index(drop=True)
probability = pd.concat([target_features, pd.DataFrame(model.predict_proba(target_features))], axis=1)
probability['RANK'] = probability[1].rank(ascending=False)
probability = probability.sort_values(by='RANK').reset_index(drop=True)
probability
INDEX | X0 | X1 | X2 | Y |
0 | 1 | 1 | 0 | 0 |
1 | 1 | 0 | 1 | 0 |
2 | 1 | 0 | 1 | 0 |
3 | 1 | 0 | 1 | 0 |
4 | 1 | 0 | 0 | 1 |
... | ... | ... | ... | ... |
95 | 1 | 1 | 0 | 1 |
96 | 1 | 1 | 1 | 1 |
97 | 1 | 0 | 1 | 1 |
98 | 1 | 1 | 0 | 1 |
99 | 0 | 1 | 0 | 0 |
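The fitted attributes above are enough to reproduce predict_proba by hand: exponentiating feature_log_prob_ gives p_ki = p(x_i = 1 | C_k), and plugging it into the Bernoulli likelihood plus Bayes' rule recovers the same posteriors. A minimal sketch, assuming model and target_features from the code above:
p_k = np.exp(model.feature_log_prob_)                                   # p_ki = p(x_i = 1 | C_k), shape (n_classes, n_features)
X_ = target_features.values                                             # binary feature rows
joint_log_likelihood = X_ @ np.log(p_k).T + (1 - X_) @ np.log(1 - p_k).T + model.class_log_prior_
manual_proba = np.exp(joint_log_likelihood)
manual_proba = manual_proba / manual_proba.sum(axis=1, keepdims=True)   # normalize over classes (Bayes' rule)
np.allclose(manual_proba, model.predict_proba(target_features))         # expected: True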
Validation: BernoulliNB: binary classification
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
df = pd.DataFrame(data=np.c_[np.random.randint(5, size=(100, 3)), np.random.randint(2, size=(100, 1))], columns=['X0', 'X1', 'X2', 'Y'])
df.loc[df.index[(df['X0'] == 0)|(df['X1'] == 0)|(df['X2'] == 0)].to_series().sample(frac=.8).index, ['Y']] = 1 # dependence
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None) # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
classifier = make_pipeline(BernoulliNB())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][1],
param_grid={
'bernoullinb__binarize' : [0, 1, 2],
'bernoullinb__class_prior':[(.1, .9), (.5, .5), (.9, .1), None],
'bernoullinb__alpha':[1, 2],
},
return_train_score=True,
)
classifier.fit(X, y) ; joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: BernoulliNB: binary classification
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
if task_type == 'binary':
scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'multi':
scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'reg':
scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
scores.name = preprocessor_name
return scores
def scoring_summary(scores):
# summary
train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0,1,axis=1)
return scores
size = 10000
df = pd.DataFrame(
data=np.c_[
stats.bernoulli.rvs(p=1/5, size=size),
stats.bernoulli.rvs(p=2/5, size=size),
stats.bernoulli.rvs(p=2/5, size=size),
stats.randint.rvs(0, 2, size=size),
],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] == 1))|
((df['X1'] == 1))|
((df['X2'] == 1))].to_series().sample(frac=.8).index, ['Y']] = 1
random_state = None; task_type = 'binary'
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
scores = list()
scores.append(scoring(BernoulliNB(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(BernoulliNB(), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(BernoulliNB(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(BernoulliNB(), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(BernoulliNB(), OneHotEncoder(min_frequency=100, max_categories=5, drop='if_binary', sparse_output=False).fit_transform(X), y, preprocessor_name='OneHotEncoder', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]
Multinomial Naive Bayes Classifier
- Assumptions: conditional independence; naive assumption
For discrete count attributes (event frequencies) on explanatory variable X and categorical response variable y,
(WIKIPEDIA) With a multinomial event model, samples (feature vectors) represent the frequencies with which certain events have been generated by a multinomial.
$${\displaystyle \textit{objective probabilistic model: } \quad p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}} = {\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{ \sum_{k} p(\mathbf {x} \mid C_{k}) p(C_{k})}} }$$ $${\displaystyle p(\mathbf {x} \mid C_{k})={\frac {(\sum _{i=1}^{n}x_{i})!}{\prod _{i=1}^{n}x_{i}!}}\prod _{i=1}^{n}{p_{ki}}^{x_{i}}}$$ $${\displaystyle \textit{where} \quad p_{ki}:=p(x_{i}\mid C_{k})}$$ $${\displaystyle {\begin{aligned}\log p(C_{k}\mid \mathbf {x} )&\varpropto \log \left(p(C_{k})\prod _{i=1}^{n}{p_{ki}}^{x_{i}}\right)\\&=\log p(C_{k})+\sum _{i=1}^{n}x_{i}\cdot \log p_{ki}\\&=b+\mathbf {w} _{k}^{\top }\mathbf {x} \end{aligned}}}$$
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
np.random.seed(10)
df = pd.DataFrame(
data=np.c_[
np.random.multinomial(n=60, pvals=[.3, .3, .4], size=(10000, )),
np.random.randint(2, size=(10000, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > 10)&(df['X0'] < 30))|
((df['X1'] > 10)&(df['X1'] < 30))|
((df['X2'] > 10)&(df['X2'] < 20))].to_series().sample(frac=.8).index, ['Y']] = 1
model = MultinomialNB(class_prior=[.5, .5]) # default: class_prior=None > model.class_count_/model.class_count_.sum()
model.fit(df[['X0', 'X1', 'X2']], df['Y'])
# key points
model.class_count_ # df['Y'].value_counts()
model.feature_count_ # df.groupby(['Y']).sum()
model.feature_log_prob_ # np.exp(model.feature_log_prob_).sum(axis=1) == 1
target_features = df[['X0', 'X1', 'X2']].drop_duplicates().reset_index(drop=True)
probability = pd.concat([target_features, pd.DataFrame(model.predict_proba(target_features))], axis=1)
probability['RANK'] = probability[1].rank(ascending=False)
probability = probability.sort_values(by='RANK').reset_index(drop=True)
probability
INDEX | X0 | X1 | X2 | Y |
0 | 2 | 5 | 1 | 0 |
1 | 2 | 4 | 5 | 1 |
2 | 2 | 1 | 2 | 0 |
3 | 3 | 1 | 2 | 1 |
4 | 1 | 3 | 1 | 0 |
... | ... | ... | ... | ... |
95 | 3 | 2 | 2 | 1 |
96 | 1 | 1 | 2 | 1 |
97 | 1 | 2 | 4 | 0 |
98 | 1 | 2 | 3 | 0 |
99 | 2 | 4 | 5 | 0 |
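The last identity above, log p(C_k | x) ∝ b + w_k^T x, can be checked directly against the fitted model: feature_log_prob_ plays the role of w_k and class_log_prior_ the role of b. A minimal sketch, assuming model and target_features from the code above:
W = model.feature_log_prob_                                             # w_ki = log p_ki, shape (n_classes, n_features)
b = model.class_log_prior_                                              # b_k = log p(C_k)
X_ = target_features.values
joint_log_likelihood = X_ @ W.T + b                                     # b + w_k^T x for every class k (the multinomial coefficient cancels)
manual_proba = np.exp(joint_log_likelihood - joint_log_likelihood.max(axis=1, keepdims=True))
manual_proba = manual_proba / manual_proba.sum(axis=1, keepdims=True)   # softmax-normalize over classes
np.allclose(manual_proba, model.predict_proba(target_features))         # expected: True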
Validation: MultinomialNB: binary classification
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
df = pd.DataFrame(
data=np.c_[
np.random.multinomial(n=60, pvals=[.3, .3, .4], size=(10000, )),
np.random.randint(2, size=(10000, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > 10)&(df['X0'] < 30))|
((df['X1'] > 10)&(df['X1'] < 30))|
((df['X2'] > 10)&(df['X2'] < 20))].to_series().sample(frac=.8).index, ['Y']] = 1
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
classifier = make_pipeline(MultinomialNB())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][0],
param_grid={
'multinomialnb__class_prior':[(.1,.9), (.2, .8), (.3, .7), (.4, .6), (.5, .5), None],
'multinomialnb__alpha':[1.0, 2.0],
},
return_train_score=True,
)
classifier.fit(X, y); joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: MultinomialNB: binary classification
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
if task_type == 'binary':
scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'multi':
scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'reg':
scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
scores.name = preprocessor_name
return scores
def scoring_summary(scores):
# summary
train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0,1,axis=1)
return scores
df = pd.DataFrame(
data=np.c_[
np.random.multinomial(n=60, pvals=[.3, .3, .4], size=(10000, )),
np.random.randint(2, size=(10000, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > 10)&(df['X0'] < 30))|
((df['X1'] > 10)&(df['X1'] < 30))|
((df['X2'] > 10)&(df['X2'] < 20))].to_series().sample(frac=.8).index, ['Y']] = 1
random_state = None; task_type = 'binary'
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
scores = list()
scores.append(scoring(MultinomialNB(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(MultinomialNB(), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(MultinomialNB(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(MultinomialNB(), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(MultinomialNB(), OneHotEncoder(min_frequency=100, max_categories=5, drop='if_binary', sparse_output=False).fit_transform(X), y, preprocessor_name='OneHotEncoder', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]
Gaussian Naive Bayes Classifier
- Assumptions: conditional independence; naive assumption
For continuous attributes on explanatory variable X and categorical response variable y
$${\displaystyle \textit{objective probabilistic model: } \quad p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}} = {\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{ \sum_{k} p(\mathbf {x} \mid C_{k}) p(C_{k})}} }$$ $${\displaystyle p(\mathbf {x} \mid C_{k})=\prod _{i=1}^{n}{\frac {1}{\sqrt {2\pi \sigma _{ki}^{2}}}}\,e^{-{\frac {(x_{i}-\mu _{ki})^{2}}{2\sigma _{ki}^{2}}}}}$$
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
np.random.seed(10)
df = pd.DataFrame(
data=np.c_[
np.random.normal(0, 1, size=(100, 3)),
np.random.randint(2, size=(100, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > -1)&(df['X0'] < 1))|
((df['X1'] > -1)&(df['X1'] < 1))|
((df['X2'] > -1)&(df['X2'] < 1))].to_series().sample(frac=.8).index, ['Y']] = 1
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
model = GaussianNB(priors=[.5, .5]) # default: priors=None > model.class_prior_
model.fit(df[['X0', 'X1', 'X2']], df['Y'])
# key points
model.class_prior_
model.class_count_ # df['Y'].value_counts()
model.var_ # df.groupby(['Y']).var(ddof=0) + model.epsilon_
model.theta_ # df.groupby(['Y']).mean()
target_features = list()
target_features.append(df[['X0', 'X1', 'X2']].mean().to_frame().T)
target_features.append(df.loc[lambda x: x['Y'] == 0, ['X0', 'X1', 'X2']].mean().to_frame().T)
target_features.append(df.loc[lambda x: x['Y'] == 1, ['X0', 'X1', 'X2']].mean().to_frame().T)
target_features = pd.concat(target_features, axis=0).reset_index(drop=True)
probability = pd.concat([target_features, pd.DataFrame(model.predict_proba(target_features))], axis=1)
probability['RANK'] = probability[1].rank(ascending=False)
probability.index = ['all', 'y=0', 'y=1']
probability
gaussian_function = lambda X, y: ( 1/np.sqrt(2*np.pi*model.var_[y]) ) * np.exp( -1 * ( (X - model.theta_[y])**2 / (2*model.var_[y]) ) ) # conditional probability density function for X given y
posterior = lambda x_, y_: gaussian_function(X=x_, y=y_).prod(axis=1).mul(model.class_prior_[y_]) # feature product under independent variable assumption
probability = lambda Y, X: posterior(x_=X, y_=Y) / (posterior(x_=X, y_=0) + posterior(x_=X, y_=1)) # conditional probability mass function for Y given X
probability_by_model = model.predict_proba(df[['X0', 'X1', 'X2']])
probability_by_formula = np.c_[probability(Y=0, X=df[['X0', 'X1', 'X2']]).values, probability(Y=1, X=df[['X0', 'X1', 'X2']]).values]
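The two arrays should agree, since model.var_ already contains the var_smoothing term epsilon_ that predict_proba uses internally; a quick consistency check:
np.allclose(probability_by_model, probability_by_formula)  # expected: True (up to floating-point tolerance)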
INDEX | X0 | X1 | X2 | Y |
0 | 1.331587 | 0.715279 | -1.545400 | 0 |
1 | -0.008384 | 0.621336 | -0.720086 | 1 |
2 | 0.265512 | 0.108549 | 0.004291 | 1 |
3 | -0.174600 | 0.433026 | 1.203037 | 1 |
4 | -0.965066 | 1.028274 | 0.228630 | 1 |
... | ... | ... | ... | ... |
95 | 0.308668 | 0.892564 | 0.011023 | 0 |
96 | 0.980074 | -2.395572 | -0.857523 | 1 |
97 | -0.364278 | 0.503927 | 0.188331 | 1 |
98 | 1.085227 | 0.356939 | 0.207330 | 0 |
99 | -0.145065 | 0.163904 | 0.829512 | 1 |
Validation: GaussianNB: binary classification
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0)
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), GaussianNB())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][1],
param_grid={
'powertransformer__standardize':[True, False],
'gaussiannb__priors':[(.1, .9), (.5, .5), None],
'gaussiannb__var_smoothing':[1e-09, 1e-08, 1e-07],
},
return_train_score=True,
)
classifier.fit(X, y) ; joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: GaussianNB: binary classification
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
if task_type == 'binary':
scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'multi':
scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'reg':
scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
scores.name = preprocessor_name
return scores
def scoring_summary(scores):
# summary
train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0,1,axis=1)
return scores
random_state = None; task_type = 'binary'
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0, random_state=random_state)
scores = list()
# transform of measure
scores.append(scoring(GaussianNB(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), PCA(n_components=None).fit_transform(X), y, preprocessor_name='PCA', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), FactorAnalysis(n_components=None, rotation='varimax').fit_transform(X), y, preprocessor_name='FactorAnalysis', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), QuantileTransformer(output_distribution='normal').fit_transform(X), y, preprocessor_name='QuantileTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), PowerTransformer(method='yeo-johnson', standardize=True).fit_transform(X), y, preprocessor_name='PowerTransform', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), Normalizer().fit_transform(X), y, preprocessor_name='Normalizer', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), StandardScaler().fit_transform(X), y, preprocessor_name='StandardScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), MinMaxScaler().fit_transform(X), y, preprocessor_name='MinMaxScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), MaxAbsScaler().fit_transform(X), y, preprocessor_name='MaxAbsScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), RobustScaler().fit_transform(X), y, preprocessor_name='RobustScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), KBinsDiscretizer(n_bins=[3]*10, encode='ordinal').fit_transform(X), y, preprocessor_name='KBinsDiscretizer', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(GaussianNB(), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(GaussianNB(), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]
Discriminant Model
Artificial Intelligence Learning Point: Group-wise Mean and Covariance of Data
Linear Discriminant Analysis
- Assumptions: conditional independence; the measurements from each class are normally distributed and share an identical (homoscedastic) covariance matrix; normality & homoscedasticity
For continuous attributes on explanatory variable X and categorical response variable y
$${\displaystyle \text{Objective probabilistic model} }$$ $${\displaystyle p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}} = {\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{ \sum_{k} p(\mathbf {x} \mid C_{k}) p(C_{k})}} }$$ $${\displaystyle p(\mathbf {x} \mid C_{k})= {\frac{1}{\sqrt {|2\pi \Sigma|}}} \exp \left(-{\frac {1}{2}}(\mathbf{x}-\mu_{k})^{T}\Sigma^{-1}(\mathbf{x}-\mu_{k} )\right) }$$ $${\displaystyle \text{Log of posterior} }$$ $${\displaystyle \begin{align} \log p(C_{k} \mid \mathbf {x}) &= \log p(\mathbf {x}\mid C_{k} ) + \log p(C_{k} ) + \textit{Constant} \\ &= -\frac{1}{2} \log |\Sigma| -{\frac {1}{2}}(\mathbf{x}-\mu_{k})^{T}\Sigma^{-1}(\mathbf{x}-\mu_{k}) + \log p(C_{k} ) + \textit{Constant} \\ &= w_{k}^{T} \mathbf {x} + w_{k0} + \textit{Constant} \end{align} }$$
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
np.random.seed(10)
df = pd.DataFrame(
data=np.c_[
np.random.normal(0, 1, size=(100, 3)),
np.random.randint(5, size=(100, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > -1)&(df['X0'] < 1))|
((df['X1'] > -1)&(df['X1'] < 1))|
((df['X2'] > -1)&(df['X2'] < 1))].to_series().sample(frac=.8).index, ['Y']] = 3
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
model = LinearDiscriminantAnalysis(store_covariance=True, solver='svd', priors=df.groupby('Y')['Y'].count() / df.groupby('Y')['Y'].count().sum())
model.fit(X,y)
model.priors_ # df.groupby('Y')['Y'].count() / df.groupby('Y')['Y'].count().sum()
model.means_ # df.groupby(['Y']).mean()
model.covariance_ # df.groupby('Y')[['X0', 'X1', 'X2']].cov(ddof=1), df[['X0', 'X1', 'X2']].cov(ddof=1)
model.coef_
model.intercept_
model.predict_proba(X)
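coef_ and intercept_ are exactly the w_k and w_k0 of the log-posterior above, so the linear decision scores can be recomputed by hand; for a multi-class fit, scikit-learn's predict_proba corresponds to the softmax of these scores (a two-class fit uses a sigmoid instead). A minimal sketch, assuming model and X from the code above:
from scipy.special import softmax
decision = X @ model.coef_.T + model.intercept_     # w_k^T x + w_k0, i.e. model.decision_function(X)
manual_proba = softmax(decision, axis=1)            # normalize the per-class scores
np.allclose(manual_proba, model.predict_proba(X))   # expected: True for the multi-class fit above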
Validation: LinearDiscriminantAnalysis: binary classification
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X, y = make_classification(n_samples=10000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0)
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=False), Normalizer(), LinearDiscriminantAnalysis())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][1],
param_grid={
'powertransformer__standardize':[True, False],
'lineardiscriminantanalysis__priors':[(.1, .9), (.5, .5), None],
'lineardiscriminantanalysis__solver':['svd', 'lsqr', 'eigen'],
},
return_train_score=True,
)
classifier.fit(X, y) ; joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: LinearDiscriminantAnalysis: binary classification
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
if task_type == 'binary':
scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'multi':
scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'reg':
scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
scores.name = preprocessor_name
return scores
def scoring_summary(scores):
# summary
train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0,1,axis=1)
return scores
random_state = None; task_type = 'binary'
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0, random_state=random_state)
scores = list()
# transform of measure
scores.append(scoring(LinearDiscriminantAnalysis(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PCA(n_components=None).fit_transform(X), y, preprocessor_name='PCA', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), FactorAnalysis(n_components=None, rotation='varimax').fit_transform(X), y, preprocessor_name='FactorAnalysis', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), QuantileTransformer(output_distribution='normal').fit_transform(X), y, preprocessor_name='QuantileTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PowerTransformer(method='yeo-johnson', standardize=True).fit_transform(X), y, preprocessor_name='PowerTransform', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), Normalizer().fit_transform(X), y, preprocessor_name='Normalizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), StandardScaler().fit_transform(X), y, preprocessor_name='StandardScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), MinMaxScaler().fit_transform(X), y, preprocessor_name='MinMaxScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), MaxAbsScaler().fit_transform(X), y, preprocessor_name='MaxAbsScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), RobustScaler().fit_transform(X), y, preprocessor_name='RobustScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), KBinsDiscretizer(n_bins=10, encode='ordinal').fit_transform(X), y, preprocessor_name='KBinsDiscretizer', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(LinearDiscriminantAnalysis(), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), Binarizer(threshold=0).fit_transform(StandardScaler().fit_transform(X)), y, preprocessor_name='StandardScaler&Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(SplineTransformer(degree=2, n_knots=3).fit_transform(X)), y, preprocessor_name='SplineTransformer&PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=True, include_bias=True).fit_transform(SplineTransformer(degree=2, n_knots=3).fit_transform(X)), y, preprocessor_name='SplineTransformer&InteractionPolynomialFeatures', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]
Conditional Covariance Analysis: LinearDiscriminantAnalysis: binary classification
#
Validation: LinearDiscriminantAnalysis: multi-class classification
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X, y = make_classification(n_samples=3000, n_features=10, n_informative=3, n_redundant=2, n_repeated=0, n_classes=5, n_clusters_per_class=1, weights=[0.1, 0.3, 0.3, .1, .2], flip_y=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None) # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=False), Normalizer(), LinearDiscriminantAnalysis())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall_macro', 'precision_macro', 'f1_macro'][1],
param_grid={
'powertransformer__standardize':[True, False],
'lineardiscriminantanalysis__priors':[(.1, .1, .1, .1, .6), (.2, .2, .2, .2, .2), None],
'lineardiscriminantanalysis__solver':['svd', 'lsqr', 'eigen'],
},
return_train_score=True
)
classifier.fit(X, y); joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: LinearDiscriminantAnalysis: multi-class classification
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
if task_type == 'binary':
scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'multi':
scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
elif task_type == 'reg':
scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
scores.name = preprocessor_name
return scores
def scoring_summary(scores):
# summary
train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0,1,axis=1)
return scores
random_state = None; task_type = 'multi'
X, y = make_classification(n_samples=3000, n_features=10, n_informative=3, n_redundant=2, n_repeated=0, n_classes=5, n_clusters_per_class=1, weights=[0.1, 0.3, 0.3, .1, .2], flip_y=0, random_state=random_state)
scores = list()
# transform of measure
scores.append(scoring(LinearDiscriminantAnalysis(), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PCA(n_components=None).fit_transform(X), y, preprocessor_name='PCA', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), FactorAnalysis(n_components=None, rotation='varimax').fit_transform(X), y, preprocessor_name='FactorAnalysis', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), QuantileTransformer(output_distribution='normal').fit_transform(X), y, preprocessor_name='QuantileTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PowerTransformer(method='yeo-johnson', standardize=True).fit_transform(X), y, preprocessor_name='PowerTransform', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), Normalizer().fit_transform(X), y, preprocessor_name='Normalizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), StandardScaler().fit_transform(X), y, preprocessor_name='StandardScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), MinMaxScaler().fit_transform(X), y, preprocessor_name='MinMaxScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), MaxAbsScaler().fit_transform(X), y, preprocessor_name='MaxAbsScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), RobustScaler().fit_transform(X), y, preprocessor_name='RobustScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), KBinsDiscretizer(n_bins=10, encode='ordinal').fit_transform(X), y, preprocessor_name='KBinsDiscretizer', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(LinearDiscriminantAnalysis(), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), Binarizer(threshold=0).fit_transform(StandardScaler().fit_transform(X)), y, preprocessor_name='StandardScaler&Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(SplineTransformer(degree=2, n_knots=3).fit_transform(X)), y, preprocessor_name='SplineTransformer&PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(LinearDiscriminantAnalysis(), PolynomialFeatures(degree=2, interaction_only=True, include_bias=True).fit_transform(SplineTransformer(degree=2, n_knots=3).fit_transform(X)), y, preprocessor_name='SplineTransformer&InteractionPolynomialFeatures', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall_macro', 'precision_macro', 'f1_macro']]
Conditional Covariance Analysis: LinearDiscriminantAnalysis: multi-class classification
#
Quadratic Discriminant Analysis
- Assumptions: conditional independence; normality
For continuous attributes on explanatory variable X and categorical response variable y
(Note) Unlike LDA, there is no assumption that the covariance matrices of the classes are identical
$${\displaystyle \text{Objective probabilistic model} }$$ $${\displaystyle p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}} = {\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{ \sum_{k} p(\mathbf {x} \mid C_{k}) p(C_{k})}} }$$ $${\displaystyle p(\mathbf {x} \mid C_{k})= {\frac{1}{\sqrt {|2\pi \Sigma_{k}|}}} \exp \left(-{\frac {1}{2}}(\mathbf{x}-\mu_{k})^{T}\Sigma_{k}^{-1}(\mathbf{x}-\mu_{k} )\right) }$$ $${\displaystyle \text{Log of posterior} }$$ $${\displaystyle \begin{align} \log p(C_{k} \mid \mathbf {x}) &= \log p(\mathbf {x}\mid C_{k} ) + \log p(C_{k} ) + \textit{Constant} \\ &= -\frac{1}{2} \log |\Sigma_{k}| -{\frac {1}{2}}(\mathbf{x}-\mu_{k})^{T}\Sigma_{k}^{-1}(\mathbf{x}-\mu_{k}) + \log p(C_{k} ) + \textit{Constant} \end{align} }$$
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
np.random.seed(10)
df = pd.DataFrame(
data=np.c_[
np.random.normal(0, 1, size=(100, 3)),
np.random.randint(5, size=(100, 1))],
columns=['X0', 'X1', 'X2', 'Y']
)
# dependence
df.loc[df.index[
((df['X0'] > -1)&(df['X0'] < 1))|
((df['X1'] > -1)&(df['X1'] < 1))|
((df['X2'] > -1)&(df['X2'] < 1))].to_series().sample(frac=.8).index, ['Y']] = 3
X = df.loc[:, df.columns != 'Y'].values
y = df.loc[:, df.columns == 'Y'].values.ravel()
model = QuadraticDiscriminantAnalysis(store_covariance=True, priors=df.groupby('Y')['Y'].count() / df.groupby('Y')['Y'].count().sum())
model.fit(X,y)
model.priors_ # df.groupby('Y')['Y'].count() / df.groupby('Y')['Y'].count().sum()
model.means_ # df.groupby(['Y']).mean()
model.covariance_ # df.groupby('Y').cov(ddof=1)
model.rotations_
model.predict_proba(X)
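The class-conditional Gaussian densities in the formula above can be evaluated directly from means_, covariance_, and priors_ to recover the posterior. A minimal sketch, assuming model and X from the code above and reasonably well-conditioned class covariances (classes with very few samples can make the comparison numerically loose):
from scipy.stats import multivariate_normal
likelihood = np.column_stack([
    multivariate_normal(mean=mean, cov=cov, allow_singular=True).pdf(X)   # p(x | C_k)
    for mean, cov in zip(model.means_, model.covariance_)
])
joint = likelihood * model.priors_                                        # p(x | C_k) p(C_k)
manual_proba = joint / joint.sum(axis=1, keepdims=True)                   # Bayes' rule
np.allclose(manual_proba, model.predict_proba(X))                         # expected: True when every class covariance is full rank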
Validation: QuadraticDiscriminantAnalysis: binary classification
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.6, 0.4], flip_y=0)
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), QuadraticDiscriminantAnalysis())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][1],
param_grid={
'powertransformer__standardize':[True, False],
'quadraticdiscriminantanalysis__priors':[(.1, .9), (.5, .5), None],
'quadraticdiscriminantanalysis__reg_param':[.0, .1, .2],
},
return_train_score=True,
)
classifier.fit(X, y) ; joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
Preprocessing effect: QuadraticDiscriminantAnalysis: binary classification
#
Conditional Covariance Analysis: QuadraticDiscriminantAnalysis: binary classification
#
Validation: QuadraticDiscriminantAnalysis: multi-class classification
#
Preprocessing effect: QuadraticDiscriminantAnalysis: multi-class classification
#
Conditional Covariance Analysis: QuadraticDiscriminantAnalysis: multi-class classification
#