Linear Regressor
Overview
import numpy as np
from sklearn.datasets import make_regression
from sklearn import linear_model
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
# alternative linear regressors: each assignment overwrites the previous one, so only the last estimator is actually fitted below
regressor = linear_model.LinearRegression()
regressor = linear_model.Ridge(alpha=.5)
regressor = linear_model.Lasso(alpha=0.1)
regressor = linear_model.LassoLars(alpha=.1)
regressor = linear_model.ElasticNet()
regressor = linear_model.BayesianRidge()
regressor = linear_model.SGDRegressor()
regressor.fit(X, y)
regressor.predict(X)
Comparison with statsmodels
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# data
data = pd.DataFrame(np.c_[
stats.binom.rvs(n=2, p=2/3, size=1000), # X0: categorical explanatory variable
stats.norm.rvs(1,1, size=1000), # X1: numerical explanatory variable
stats.norm.rvs(0,1, size=1000), # Y: response variable
], columns=['X0', 'X1', 'Y'])
# sklearn
model = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['X0']),
        (FunctionTransformer(lambda X: X - X.mean(axis=0)), ['X1'])),
    LinearRegression(fit_intercept=False))
model.fit(data[['X0', 'X1']], data['Y']); print(model['linearregression'].coef_.round(4))
# statsmodels
centering_data = data.copy()
centering_data.loc[:, ['X1']] = data[['X1']] - data[['X1']].mean(axis=0)
model = sm.OLS.from_formula("Y ~ C(X0) + X1 -1", data=centering_data).fit(); print(model.params.round(4).tolist())
Task: Regression
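A minimal fit/predict sketch for this task, mirroring the Overview block above (parameters and the unset random_state follow the rest of this post):
from sklearn.datasets import make_regression
from sklearn import linear_model
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, noise=0.0)
regressor = linear_model.LinearRegression()
regressor.fit(X, y) # fit on the synthetic regression data
regressor.predict(X) # in-sample predictions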
Validation: LinearRegression: regression
# https://scikit-learn.org/stable/modules/model_evaluation.html
import joblib
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
cv = KFold(n_splits=10, shuffle=False, random_state=None)
# version1: multiple parameters
regressor = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), LinearRegression())
regressor = GridSearchCV(
estimator=regressor, cv=cv,
scoring=['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error'][1],
param_grid={
'linearregression__fit_intercept':[False, True],
'linearregression__positive':[False, True]
},
return_train_score=True)
regressor.fit(X, y); joblib.dump(regressor, 'regressor.joblib')
regressor = joblib.load('regressor.joblib')
# Evaluation: reshape cv_results_ into a tidy frame (rows: parameter combinations; columns: per-split/mean/std train & test scores, rank, fit/score times)
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , regressor.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , regressor.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , regressor.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(regressor.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
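Since GridSearchCV refits the best configuration on the full data by default (refit=True), the winning pipeline can also be inspected directly; a minimal sketch:
print(regressor.best_params_) # hyper-parameters of the best pipeline
print(regressor.best_score_) # mean cross-validated explained_variance of the best pipeline
best_pipeline = regressor.best_estimator_ # refit pipeline, ready for predict()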
Covariance Analysis: LinearRegression: regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
def covariance_and_correlation(data, title=None, annot=True, fmt='.2g', cmap="RdBu",):
title = '(' + title + '): ' if title is not None else ''
fig = plt.figure(figsize=(30,30))
ax00 = plt.subplot2grid((1,2), (0,0))
ax00.set_title(title + 'Covariance')
sns.heatmap(
pd.DataFrame(data).cov().round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax00
)
ax01 = plt.subplot2grid((1,2), (0,1))
ax01.set_title(title + 'Correlation | Standardized Features Covariance')
sns.heatmap(
pd.DataFrame(data).corr(method=['pearson', 'spearman', 'kendall'][0]).round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax01
)
return fig
# regression
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
X1 = X - X.mean(axis=0) # centering
X2 = (X - X.mean(axis=0)) / X.std(ddof=1, axis=0) # standardizing
y1 = y - y.mean(axis=0) # centering
y2 = (y - y.mean(axis=0)) / y.std(ddof=1, axis=0) # standardizing
P1 = PCA().fit_transform(X1)
P2 = PCA().fit_transform(X2)
regressors = dict(); scores = dict()
scores['r2'] = dict(); scores['mse'] = dict(); scores['mae'] = dict(); scores['mape'] = dict()
regression_steps = [('X1y', (X1, y)), ('X2y', (X2, y)), ('X1y1', (X1, y1)), ('X2y1', (X2, y1)), ('X1y2', (X1, y2)), ('X2y2', (X2, y2)), ('P1y', (P1, y)), ('P2y', (P2, y)), ('P1y1', (P1, y1)), ('P2y1', (P2, y1)), ('P1y2', (P1, y2)), ('P2y2', (P2, y2))]
for step, (X_, y_) in regression_steps:
# exploratory training
regressors[step] = LinearRegression().fit(X_, y_)
# scoring
scores['r2'][step] = metrics.r2_score(y_, regressors[step].predict(X_)).round(4)
scores['mse'][step] = metrics.mean_squared_error(y_, regressors[step].predict(X_)).round(4)
scores['mae'][step] = metrics.mean_absolute_error(y_, regressors[step].predict(X_)).round(4)
scores['mape'][step] = metrics.mean_absolute_percentage_error(y_, regressors[step].predict(X_)).round(4)
# covariance visualization
covariance_and_correlation(data=np.c_[X1, y1], title='Centered X & Centered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=np.c_[X2, y1], title='Standardized X & Centered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P1, title='Principal Component of Centered X', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P2, title='Principal Component of Standardized X', annot=True, fmt='.2g', cmap="RdBu")
plt.tight_layout()
# regression coefficients
covariance_analysis = pd.DataFrame(
data=np.concatenate([regressor.coef_.round(0)[:, np.newaxis] for regressor in regressors.values()], axis=1).T,
index=regressors.keys())
covariance_analysis.insert(0, 'r2', scores['r2'].values())
covariance_analysis.insert(1, 'mse', scores['mse'].values())
covariance_analysis.insert(2, 'mae', scores['mae'].values())
covariance_analysis.insert(3, 'mape', scores['mape'].values())
covariance_analysis
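As an aside, keeping the fitted PCA objects (rather than only the fit_transform output) also exposes how much variance each principal component carries; a minimal sketch on the same X1/X2 as above:
pca1 = PCA().fit(X1) # PCA on centered features
pca2 = PCA().fit(X2) # PCA on standardized features
print(pca1.explained_variance_ratio_.round(4))
print(pca2.explained_variance_ratio_.round(4))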
Linear Classifier
Overview
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
# binary classification
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
X_PCA = PCA().fit_transform(X)
classifier1 = LogisticRegression(solver='newton-cholesky', penalty='l2').fit(X, y)
classifier2 = LogisticRegression(solver='newton-cholesky', penalty='l2').fit(X_PCA, y)
print(classifier1.coef_.round(0))
print(classifier2.coef_.round(0))
# multiclass classification
X, y = make_classification(n_samples=3000, n_features=10, n_informative=3, n_classes=5, n_clusters_per_class=1, weights=[0.1, 0.3, 0.3, .1, .2], flip_y=0)
X_PCA = PCA().fit_transform(X)
classifier1 = LogisticRegression(solver='lbfgs', penalty='l2').fit(X, y)
classifier2 = LogisticRegression(solver='lbfgs', penalty='l2').fit(X_PCA, y)
print(classifier1.coef_.round(0))
print(classifier2.coef_.round(0))
classification | solver | penalty |
---|---|---|
small dataset | liblinear | l1(lasso), l2(ridge) |
binary class | newton-cholesky | l2(ridge) |
multi-class | newton-cg | l2(ridge) |
multi-class | sag | l2(ridge) |
multi-class | saga | elasticnet, l1(lasso), l2(ridge) |
multi-class | lbfgs | l2(ridge) |
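Of the combinations listed above, only saga accepts the elasticnet penalty; a minimal sketch (the l1_ratio and max_iter values are arbitrary illustrations, and saga converges faster on standardized features):
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, max_iter=5000)
classifier.fit(X, y) # X, y from the multiclass example above
print(classifier.coef_.round(0))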
Comparison with statsmodels
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# data
data = pd.DataFrame(np.c_[
stats.binom.rvs(n=2, p=2/3, size=1000), # X0: categorical explanatory variable
stats.norm.rvs(1,1, size=1000), # X1: numerical explanatory variable
stats.bernoulli.rvs(p=2/3, size=1000), # Y: response variable
], columns=['X0', 'X1', 'Y'])
# sklearn
model = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['X0']),
        (FunctionTransformer(lambda X: X - X.mean(axis=0)), ['X1'])),
    LogisticRegression(solver='newton-cg', penalty=None, fit_intercept=False))
model.fit(data[['X0', 'X1']], data['Y']); print(model['logisticregression'].coef_.round(4))
# statsmodels
centering_data = data.copy()
centering_data.loc[:, ['X1']] = data[['X1']] - data[['X1']].mean(axis=0)
model = sm.Logit.from_formula("Y ~ C(X0) + X1 -1", data=centering_data).fit(); print(model.params.round(4).tolist())
Task: Binary classification
from sklearn.datasets import make_classification
from sklearn import linear_model
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
classifier = linear_model.LogisticRegression(solver='newton-cholesky', penalty='l2')
classifier.fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)
Validation: LogisticRegression: binary classification
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None) # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), LogisticRegression())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall', 'precision', 'f1'][0],
param_grid={
'logisticregression__penalty' : ['l2', None][0:1],
'logisticregression__fit_intercept' : [True, False],
'logisticregression__intercept_scaling' : [1,2,3],
'logisticregression__C' : [1.0, .5, .3, .1, .01, .001],
# 'logisticregression__class_weight' : ['balanced', None],
'logisticregression__solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'][3:4],
},
return_train_score=True
)
classifier.fit(X, y); joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation: reshape cv_results_ into a tidy frame (rows: parameter combinations; columns: per-split/mean/std train & test scores, rank, fit/score times)
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
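roc_curve, precision_recall_curve, and auc are imported above but not used; a minimal in-sample sketch of applying them to the refit best pipeline (for illustration only, not a substitute for the cross-validated scores):
proba = classifier.predict_proba(X)[:, 1] # positive-class probabilities from the refit best pipeline
fpr, tpr, _ = roc_curve(y, proba)
precision, recall, _ = precision_recall_curve(y, proba)
print(round(auc(fpr, tpr), 4)) # area under the ROC curve
print(round(auc(recall, precision), 4)) # area under the precision-recall curve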
Covariance Analysis: LogisticRegression: binary classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
def covariance_and_correlation(data, title=None, annot=True, fmt='.2g', cmap="RdBu",):
title = '(' + title + '): ' if title is not None else ''
fig = plt.figure(figsize=(30,30))
ax00 = plt.subplot2grid((1,2), (0,0))
ax00.set_title(title + 'Covariance')
sns.heatmap(
pd.DataFrame(data).cov().round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax00
)
ax01 = plt.subplot2grid((1,2), (0,1))
ax01.set_title(title + 'Correlation | Standardized Features Covariance')
sns.heatmap(
pd.DataFrame(data).corr(method=['pearson', 'spearman', 'kendall'][0]).round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax01
)
return fig
# binary classification
X, y = make_classification(n_samples=3000, n_features=10, n_redundant=1, n_repeated=2, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
X1 = X - X.mean(axis=0) # centering
X2 = (X - X.mean(axis=0)) / X.std(ddof=1, axis=0) # standardizing
y1 = y - y.mean(axis=0) # centering
y2 = (y - y.mean(axis=0)) / y.std(ddof=1, axis=0) # standardizing
P1 = PCA().fit_transform(X1)
P2 = PCA().fit_transform(X2)
classifiers = dict(); scores = dict()
scores['accuracy'] = dict(); scores['recall'] = dict(); scores['precision'] = dict(); scores['f1'] = dict()
classification_steps = [('X1y', (X1, y)), ('X2y', (X2, y)), ('P1y', (P1, y)), ('P2y', (P2, y)),]
for step, (X_, y_) in classification_steps:
# exploratory training
classifiers[step] = LogisticRegression(solver='newton-cholesky', penalty='l2').fit(X_, y_)
# scoring
scores['accuracy'][step] = metrics.accuracy_score(y_, classifiers[step].predict(X_)).round(4)
scores['recall'][step] = metrics.recall_score(y_, classifiers[step].predict(X_), average='binary').round(4)
scores['precision'][step] = metrics.precision_score(y_, classifiers[step].predict(X_), average='binary').round(4)
scores['f1'][step] = metrics.f1_score(y_, classifiers[step].predict(X_), average='binary').round(4)
# covariance visualization
covariance_and_correlation(data=np.c_[X1, y], title='Centered X & Uncentered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=np.c_[X2, y], title='Standardized X & Uncentered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P1, title='Principal Component of Centered X', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P2, title='Principal Component of Standardized X', annot=True, fmt='.2g', cmap="RdBu")
plt.tight_layout()
# logistic regression coefficients
covariance_analysis = pd.DataFrame(
data=np.concatenate([classifier.coef_.round(0).squeeze()[:, np.newaxis] for classifier in classifiers.values()], axis=1).T,
index=classifiers.keys())
covariance_analysis.insert(0, 'accuracy', scores['accuracy'].values())
covariance_analysis.insert(1, 'recall', scores['recall'].values())
covariance_analysis.insert(2, 'precision', scores['precision'].values())
covariance_analysis.insert(3, 'f1', scores['f1'].values())
covariance_analysis
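A confusion matrix complements the scalar scores above; a minimal sketch using the standardized-feature fit:
print(metrics.confusion_matrix(y, classifiers['X2y'].predict(X2))) # rows: true class, columns: predicted class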
Task: Multi-class classification
from sklearn.datasets import make_classification
from sklearn import linear_model
X, y = make_classification(n_samples=3000, n_features=10, n_informative=3, n_redundant=2, n_repeated=0, n_classes=5, n_clusters_per_class=1, weights=[0.1, 0.3, 0.3, .1, .2], flip_y=0)
classifier = linear_model.LogisticRegression(solver='newton-cg', penalty='l2')
classifier.fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)
Validation: LogisticRegression: multi-class classification
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=3000, n_features=10, n_informative=3, n_redundant=2, n_repeated=0, n_classes=5, n_clusters_per_class=1, weights=[0.1, 0.3, 0.3, .1, .2], flip_y=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), Normalizer(), LogisticRegression())
classifier = GridSearchCV(
estimator=classifier, cv=cv,
scoring=['accuracy', 'recall_macro', 'precision_macro', 'f1_macro'][1],
param_grid={
'logisticregression__penalty' : ['l2', None][0:1],
'logisticregression__fit_intercept' : [True, False],
'logisticregression__intercept_scaling' : [1,2,3],
'logisticregression__C' : [1.0, .5, .1, .01],
# 'logisticregression__class_weight' : ['balanced', None],
'logisticregression__solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'][2:3],
},
return_train_score=True
)
classifier.fit(X, y); joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation: reshape cv_results_ into a tidy frame (rows: parameter combinations; columns: per-split/mean/std train & test scores, rank, fit/score times)
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank'))
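For a probability-based multi-class score, roc_auc_score with one-vs-rest averaging can be applied to the refit best pipeline; a minimal in-sample sketch (matching the roc_auc_ovr_weighted entry in the commented scoring list above):
from sklearn.metrics import roc_auc_score
proba = classifier.predict_proba(X) # class-membership probabilities from the refit best pipeline
print(round(roc_auc_score(y, proba, multi_class='ovr', average='weighted'), 4))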
Covariance Analysis: LogisticRegression: multi-class classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
def covariance_and_correlation(data, title=None, annot=True, fmt='.2g', cmap="RdBu",):
title = '(' + title + '): ' if title is not None else ''
fig = plt.figure(figsize=(30,30))
ax00 = plt.subplot2grid((1,2), (0,0))
ax00.set_title(title + 'Covariance')
sns.heatmap(
pd.DataFrame(data).cov().round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax00
)
ax01 = plt.subplot2grid((1,2), (0,1))
ax01.set_title(title + 'Correlation | Standardized Features Covariance')
sns.heatmap(
pd.DataFrame(data).corr(method=['pearson', 'spearman', 'kendall'][0]).round(4),
cmap=cmap,
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.1},
annot=annot, fmt=fmt,
ax=ax01
)
return fig
# multiclass classification
X, y = make_classification(n_samples=3000, n_features=10, n_informative=6, n_redundant=1, n_repeated=2, n_classes=5, n_clusters_per_class=1, weights=[.2, .2, .2, .2, .2], flip_y=0)
X1 = X - X.mean(axis=0) # centering
X2 = (X - X.mean(axis=0)) / X.std(ddof=1, axis=0) # standardizing
y1 = y - y.mean(axis=0) # centering
y2 = (y - y.mean(axis=0)) / y.std(ddof=1, axis=0) # standardizing
P1 = PCA().fit_transform(X1)
P2 = PCA().fit_transform(X2)
classifiers = dict(); scores = dict()
scores['accuracy'] = dict(); scores['recall'] = dict(); scores['precision'] = dict(); scores['f1'] = dict()
classification_steps = [('X1y', (X1, y)), ('X2y', (X2, y)), ('P1y', (P1, y)), ('P2y', (P2, y)),]
for step, (X_, y_) in classification_steps:
# exploratory training
classifiers[step] = LogisticRegression(solver='lbfgs', penalty='l2').fit(X_, y_)
# scoring
scores['accuracy'][step] = metrics.accuracy_score(y_, classifiers[step].predict(X_)).round(4)
scores['recall'][step] = metrics.recall_score(y_, classifiers[step].predict(X_), average='weighted').round(4)
scores['precision'][step] = metrics.precision_score(y_, classifiers[step].predict(X_), average='weighted').round(4)
scores['f1'][step] = metrics.f1_score(y_, classifiers[step].predict(X_), average='weighted').round(4)
# covariance visualization
covariance_and_correlation(data=np.c_[X1, y], title='Centered X & Uncentered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=np.c_[X2, y], title='Standardized X & Uncentered y', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P1, title='Principal Component of Centered X', annot=True, fmt='.2g', cmap="RdBu")
covariance_and_correlation(data=P2, title='Principal Component of Standardized X', annot=True, fmt='.2g', cmap="RdBu")
plt.tight_layout()
covariance_analysis = list()
for treatment_type in classifiers.keys():
regression_coefficients = pd.DataFrame(data=classifiers[treatment_type].coef_.round(0)).reset_index().rename(columns={'index':'target'})
regression_coefficients.insert(0, 'treatment_type', treatment_type)
covariance_analysis.append(regression_coefficients)
covariance_analysis = pd.concat(covariance_analysis, axis=0).set_index(['treatment_type'])
covariance_analysis['accuracy'] = pd.Series(scores['accuracy'])
covariance_analysis['weighted_recall'] = pd.Series(scores['recall'])
covariance_analysis['weighted_precision'] = pd.Series(scores['precision'])
covariance_analysis['weighted_f1'] = pd.Series(scores['f1'])
covariance_analysis = covariance_analysis.reset_index().set_index(['treatment_type', 'target'])
covariance_analysis
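A per-class breakdown of the weighted scores above is available via classification_report; a minimal sketch on the standardized-feature fit:
print(metrics.classification_report(y, classifiers['X2y'].predict(X2), digits=4))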