K-Nearest Neighbor Model
Overview
from sklearn.datasets import make_classification, make_regression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# classification: three imbalanced classes; fit, predict, and class probabilities on the training data
X, y = make_classification(n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1, weights=[.6, .3, .1], flip_y=0)
classifier = KNeighborsClassifier()
classifier.fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)
# regression: continuous target; fit and predict on the training data
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
regressor = KNeighborsRegressor()
regressor.fit(X, y)
regressor.predict(X)
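The overview predicts on the training data itself; a minimal held-out sketch, together with the kneighbors query that backs every prediction, could look as follows (the split size and random_state are arbitrary choices, not part of the original snippet):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X, y = make_classification(n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1, weights=[.6, .3, .1], flip_y=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = KNeighborsClassifier().fit(X_train, y_train)
classifier.score(X_test, y_test) # mean accuracy on the held-out split
classifier.kneighbors(X_test[:1], n_neighbors=5) # distances and indices of the 5 nearest training samples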
Task: Classification
Validation: KNeighborsClassifier: binary classification
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.neighbors import KNeighborsClassifier
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None) # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None) # cross validation & randomness control
#binary_class_scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
#multi_class_scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), KNeighborsClassifier(metric='minkowski'))
classifier = GridSearchCV(
    estimator=classifier, cv=cv,
    scoring=['accuracy', 'recall', 'precision', 'f1'][0],
    param_grid={
        'kneighborsclassifier__n_neighbors': [5, 10, 20, 30],
        'kneighborsclassifier__weights': ['uniform', 'distance'],
        'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'][1:3],
        'kneighborsclassifier__p': [1, 2],
        'kneighborsclassifier__leaf_size': [30, 50],
    },
    return_train_score=True
)
classifier.fit(X, y); joblib.dump(classifier, 'classifier.joblib')
classifier = joblib.load('classifier.joblib')
classifier.cv_results_
# Evaluation: reshape cv_results_ into a scores table indexed by hyperparameter combination
train_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('train_score') , classifier.cv_results_.items())))
train_scores = train_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=train_scores[0].to_dict())
train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train_score', column.replace('_train_score', '')), train_scores.columns))
test_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('test_score') , classifier.cv_results_.items())))
test_scores = test_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=test_scores[0].to_dict())
test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test_score', column.replace('_test_score', '')), test_scores.columns))
time_scores = pd.DataFrame(list(filter(lambda score: score[0].endswith('time') , classifier.cv_results_.items())))
time_scores = time_scores[1].apply(lambda x: pd.Series(x)).T.rename(columns=time_scores[0].to_dict())
time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', column.replace('_time', '')), time_scores.columns))
scores = pd.concat([train_scores, test_scores, time_scores], axis=1)
scores.index = pd.MultiIndex.from_frame(pd.DataFrame(classifier.cv_results_['params']))
scores.sort_values(('test_score', 'rank')) # best parameter combinations first
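roc_curve, precision_recall_curve, and auc are imported above but not used in the listing; a minimal sketch of applying them to the refitted best pipeline, assuming the fitted classifier and the X, y from this section (scored on the training data here only for brevity; a held-out split is preferable):
probabilities = classifier.predict_proba(X)[:, 1] # P(y=1) from the best pipeline refit by GridSearchCV
fpr, tpr, roc_thresholds = roc_curve(y, probabilities)
precision, recall, pr_thresholds = precision_recall_curve(y, probabilities)
auc(fpr, tpr), auc(recall, precision) # area under the ROC and precision-recall curves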
Preprocessing effect: KNeighborsClassifier: binary classification
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler, Normalizer, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, Binarizer, KBinsDiscretizer, PolynomialFeatures, SplineTransformer
def scoring(classifier, X, y, preprocessor_name, task_type, random_state=None):
    from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, RepeatedKFold
    if task_type == 'binary':
        scoring = ['accuracy', 'balanced_accuracy', 'recall', 'average_precision', 'precision', 'f1', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
    elif task_type == 'multi':
        scoring = ['accuracy', 'balanced_accuracy', 'recall_micro', 'recall_macro', 'recall_weighted', 'precision_micro', 'precision_macro', 'precision_weighted', 'f1_micro', 'f1_macro', 'f1_weighted', 'jaccard_micro', 'jaccard_macro', 'jaccard_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted']
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state) # StratifiedKFold(n_splits=5, shuffle=False, random_state=None) # cross validation & randomness control
    elif task_type == 'reg':
        scoring = ['r2', 'explained_variance', 'max_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error']
        cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state) # KFold(n_splits=5, shuffle=False, random_state=None)
    scores = pd.DataFrame(cross_validate(classifier, X, y, cv=cv, scoring=scoring, return_train_score=True)).mean()
    scores.name = preprocessor_name
    return scores
def scoring_summary(scores):
    # summary
    train_scores = scores[list(filter(lambda column: column.startswith('train'), scores.columns))]
    test_scores = scores[list(filter(lambda column: column.startswith('test'), scores.columns))]
    time_scores = scores[list(filter(lambda column: column.endswith('time'), scores.columns))]
    train_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('train', '_'.join(column.split('_')[1:])), train_scores.columns))
    test_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('test', '_'.join(column.split('_')[1:])), test_scores.columns))
    time_scores.columns = pd.MultiIndex.from_tuples(map(lambda column: ('time', '_'.join(column.split('_')[:-1])), time_scores.columns))
    scores = pd.concat([train_scores, test_scores, time_scores], axis=1).swaplevel(0, 1, axis=1)
    return scores
random_state = None; task_type = 'binary'
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.9, 0.1], flip_y=0, random_state=random_state)
# baseline
scores = list()
params = dict()
scores.append(scoring(KNeighborsClassifier(**params), X, y, preprocessor_name='baseline', task_type=task_type, random_state=random_state))
# transform of measure
scores.append(scoring(KNeighborsClassifier(**params), PCA(n_components=None).fit_transform(X), y, preprocessor_name='PCA', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), FactorAnalysis(n_components=None, rotation='varimax').fit_transform(X), y, preprocessor_name='FactorAnalysis', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), QuantileTransformer(output_distribution='normal').fit_transform(X), y, preprocessor_name='QuantileTransformer', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), PowerTransformer(method='yeo-johnson', standardize=True).fit_transform(X), y, preprocessor_name='PowerTransform', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), Normalizer().fit_transform(X), y, preprocessor_name='Normalizer', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), StandardScaler().fit_transform(X), y, preprocessor_name='StandardScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), MinMaxScaler().fit_transform(X), y, preprocessor_name='MinMaxScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), MaxAbsScaler().fit_transform(X), y, preprocessor_name='MaxAbsScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), RobustScaler().fit_transform(X), y, preprocessor_name='RobustScaler', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), KBinsDiscretizer(n_bins=[3]*10, encode='ordinal').fit_transform(X), y, preprocessor_name='KBinsDiscretizer', task_type=task_type, random_state=random_state))
# transform of sigma-algebra
scores.append(scoring(KNeighborsClassifier(**params), Binarizer(threshold=0).fit_transform(X), y, preprocessor_name='Binarizer', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), PolynomialFeatures(degree=2, interaction_only=False, include_bias=True).fit_transform(X), y, preprocessor_name='PolynomialFeatures', task_type=task_type, random_state=random_state))
scores.append(scoring(KNeighborsClassifier(**params), SplineTransformer(degree=2, n_knots=3).fit_transform(X), y, preprocessor_name='SplineTransformer', task_type=task_type, random_state=random_state))
scores = pd.concat(scores, axis=1).T
# summary
scores = scoring_summary(scores)
scores[['accuracy', 'recall', 'precision', 'f1']]
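To rank the preprocessors, the summary frame can be sorted on any (metric, split) column, for example by mean test f1 (assuming the scores frame built above):
scores[['accuracy', 'recall', 'precision', 'f1']].sort_values(('f1', 'test'), ascending=False) # after swaplevel, columns are (metric, train/test)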
Radius Neighbor Model
Overview
from sklearn.datasets import make_classification, make_regression
from sklearn.neighbors import RadiusNeighborsClassifier, RadiusNeighborsRegressor
# classification: three imbalanced classes; fit, predict, and class probabilities on the training data
X, y = make_classification(n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1, weights=[.6, .3, .1], flip_y=0)
classifier = RadiusNeighborsClassifier()
classifier.fit(X, y)
classifier.predict(X)
classifier.predict_proba(X)
# regression: continuous target; fit and predict on the training data
X, y = make_regression(n_samples=3000, n_features=10, n_informative=5, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
regressor = RadiusNeighborsRegressor()
regressor.fit(X, y)
regressor.predict(X)
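Unlike KNeighborsClassifier, a radius-based model can meet query points with no training sample inside the radius; a minimal sketch of handling this with the radius and outlier_label parameters (the scaler, radius value, and outlier_label choice are assumptions, not from the original snippet):
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import RadiusNeighborsClassifier
X, y = make_classification(n_samples=3000, n_features=10, n_classes=3, n_clusters_per_class=1, weights=[.6, .3, .1], flip_y=0)
# scale first so a single radius is comparable across features; label empty neighborhoods instead of raising an error
classifier = make_pipeline(StandardScaler(), RadiusNeighborsClassifier(radius=3.0, weights='distance', outlier_label='most_frequent'))
classifier.fit(X, y)
classifier.predict(X)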
Validation
#
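A validation analogous to the KNeighborsClassifier section could look like the following sketch; the parameter grid, radius values, and outlier_label choice are assumptions rather than settings from the original post:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.neighbors import RadiusNeighborsClassifier
X, y = make_classification(n_samples=3000, n_features=10, n_classes=2, n_clusters_per_class=1, weights=[0.6, 0.4], flip_y=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=None)
classifier = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True), RadiusNeighborsClassifier(outlier_label='most_frequent'))
classifier = GridSearchCV(
    estimator=classifier, cv=cv,
    scoring='f1',
    param_grid={
        'radiusneighborsclassifier__radius': [1.0, 2.0, 3.0, 5.0],
        'radiusneighborsclassifier__weights': ['uniform', 'distance'],
        'radiusneighborsclassifier__p': [1, 2],
    },
    return_train_score=True
)
classifier.fit(X, y)
pd.DataFrame(classifier.cv_results_).sort_values('rank_test_score') # best radius/weighting combinations first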
Reference