sklearnのutilsにall_estimatorsという関数があります。これでtype_filterをするとclassifierを全部取得できることがわかりました。ちまちま普段使いのモデルを当てていかなくても自動で全部当ててくれます。

ちなみにauto-sklearnというautoMLライブラリがありますが、Windowsではそのままでは動きません。sklearnの中から良さそうなモデルを探すだけのライトな用途のautoMLであれば、自分で書いてしまいましょう。

sklearn all_estimatorsで取得できるモデルについて

all_estimatorsの引数type_filterにclassifierを渡します

注:all_estimatorsは、sklearn 1.2.2では以下のようにインポート元が変更されています(この記事のコード例は変更前の書き方を使っています)。

from sklearn.utils.discovery import all_estimators

from sklearn.utils import all_estimators
all_estimators(type_filter="classifier")

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
('ClassifierChain', sklearn.multioutput.ClassifierChain),
('ComplementNB', sklearn.naive_bayes.ComplementNB),
('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
('DummyClassifier', sklearn.dummy.DummyClassifier),
('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
('GaussianNB', sklearn.naive_bayes.GaussianNB),
('GaussianProcessClassifier',
sklearn.gaussian_process._gpc.GaussianProcessClassifier),
('GradientBoostingClassifier',
sklearn.ensemble._gb.GradientBoostingClassifier),
('HistGradientBoostingClassifier',
sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier),
('KNeighborsClassifier',
sklearn.neighbors._classification.KNeighborsClassifier),
('LabelPropagation',
sklearn.semi_supervised._label_propagation.LabelPropagation),
('LabelSpreading', sklearn.semi_supervised._label_propagation.LabelSpreading),
('LinearDiscriminantAnalysis',
sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
('LinearSVC', sklearn.svm._classes.LinearSVC),
('LogisticRegression', sklearn.linear_model._logistic.LogisticRegression),
('LogisticRegressionCV', sklearn.linear_model._logistic.LogisticRegressionCV),
('MLPClassifier',
sklearn.neural_network._multilayer_perceptron.MLPClassifier),
('MultiOutputClassifier', sklearn.multioutput.MultiOutputClassifier),
('MultinomialNB', sklearn.naive_bayes.MultinomialNB),
('NearestCentroid', sklearn.neighbors._nearest_centroid.NearestCentroid),
('NuSVC', sklearn.svm._classes.NuSVC),
('OneVsOneClassifier', sklearn.multiclass.OneVsOneClassifier),
('OneVsRestClassifier', sklearn.multiclass.OneVsRestClassifier),
('OutputCodeClassifier', sklearn.multiclass.OutputCodeClassifier),
('PassiveAggressiveClassifier',
sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier),
('Perceptron', sklearn.linear_model._perceptron.Perceptron),
('QuadraticDiscriminantAnalysis',
sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis),
('RadiusNeighborsClassifier',
sklearn.neighbors._classification.RadiusNeighborsClassifier),
('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
('RidgeClassifier', sklearn.linear_model._ridge.RidgeClassifier),
('RidgeClassifierCV', sklearn.linear_model._ridge.RidgeClassifierCV),
('SGDClassifier', sklearn.linear_model._stochastic_gradient.SGDClassifier),
('SVC', sklearn.svm._classes.SVC),
('StackingClassifier', sklearn.ensemble._stacking.StackingClassifier),
('VotingClassifier', sklearn.ensemble._voting.VotingClassifier)]

全てのclassifierを試す自作関数

k-foldクロスバリデーション入りです。この関係で、cvは2以上にしないとエラーがでます。評価関数は、accuracyを使っていますが必要に応じて直してください。

def apply_all_classifier(x, y, cv=3, random_state=None):
    """Benchmark every sklearn classifier with k-fold cross validation.

    Each classifier returned by ``all_estimators(type_filter="classifier")``
    is instantiated with default parameters, fitted on each training fold,
    and scored with accuracy on the matching test fold.  Partial results are
    checkpointed to ``results.csv`` after every model so a crash on a slow
    estimator does not lose prior work; the aggregated ranking is written to
    ``results_classifier.csv``.

    Args:
        x: independent variables, indexable by integer index arrays
           (e.g. a numpy array).
        y: target variable, same length as ``x``.
        cv (int, optional): number of folds; KFold requires cv >= 2.
            Defaults to 3.
        random_state (int | None, optional): seed forwarded to ``KFold`` so
            the shuffled fold assignment is reproducible. Defaults to None
            (non-deterministic folds, as before).

    Returns:
        pandas.DataFrame: one row per model with mean accuracy ("mean_score"),
        mean fit+predict time ("mean_time"), and the raw per-fold results,
        sorted by mean accuracy descending.
    """
    import time

    import pandas as pd
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import KFold
    from sklearn.utils import all_estimators

    rows = []  # one dict per (model, fold); concatenated once at the end
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    for fold, (train_index, test_index) in enumerate(kf.split(X=x)):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for name, estimator_cls in all_estimators(type_filter="classifier"):
            # Checkpoint everything collected so far before the next model.
            pd.DataFrame(rows).to_csv("results.csv", index=False)
            print(name)
            time_start = time.time()
            try:
                model = estimator_cls()
            except Exception:
                # Meta-estimators (e.g. ClassifierChain, StackingClassifier)
                # need mandatory constructor arguments and cannot run with
                # defaults; record the failure and move on.
                print("instanciating error")
                rows.append({"model_name": name, "score": "instanciatingError",
                             "timeElapsed": 0, "cv": fold})
                continue
            try:
                model.fit(x_train, y_train)
                result = accuracy_score(y_true=y_test,
                                        y_pred=model.predict(x_test))
            except Exception:
                # Some estimators reject this data (dtype, negativity, ...).
                print("fitting error")
                rows.append({"model_name": name, "score": "fittingError",
                             "timeElapsed": 0, "cv": fold})
                continue
            print(result)
            rows.append({"model_name": name, "score": result,
                         "timeElapsed": time.time() - time_start, "cv": fold})

    results_df = pd.DataFrame(rows,
                              columns=["model_name", "score",
                                       "timeElapsed", "cv"])

    # Wide table: one (score, fold) / (timeElapsed, fold) column per fold.
    per_fold = results_df.pivot(index="model_name", columns="cv").reset_index()
    per_fold.columns = ["model_name"] + list(per_fold.columns.values)[1:]

    # Non-numeric "...Error" markers become NaN and drop out of the means.
    results_df["mean_score"] = pd.to_numeric(results_df["score"],
                                             errors="coerce")
    results_df["mean_time"] = pd.to_numeric(results_df["timeElapsed"],
                                            errors="coerce")
    results_df = (results_df.groupby("model_name")[["mean_score", "mean_time"]]
                  .mean()
                  .sort_values(by="mean_score", ascending=False)
                  .reset_index()
                  .merge(per_fold, on=["model_name"]))
    results_df.to_csv("results_classifier.csv", index=False)
    print("Done!")
    return results_df

試しにsklearnの乳がんのデータセットを使ってみます。

使い方は、apply_all_classifier(説明変数、目的変数、kフォルドクロスバリデーションの回数)です。

# Demo: benchmark every sklearn classifier on the breast-cancer dataset
# with 3-fold cross validation (uses apply_all_classifier defined above;
# results land in results.csv / results_classifier.csv).
import pandas as pd
from sklearn.datasets  import load_breast_cancer
# x: feature matrix (numpy array), y: binary target labels
x,y = load_breast_cancer(return_X_y=True)
apply_all_classifier(x,y,cv=3)

Categories:

category