今回はランダムフォレストを試してみる。

Pythonでの実装コード

Pythonでは非常に簡単に実装できる。

from sklearn.ensemble import RandomForestClassifier
mod1 = RandomForestClassifier(n_estimators = 100
                              ,criterion = "gini"
                              ,max_depth = i_integer
                              ,max_features = "auto"
                              ,bootstrap = True
                              ,random_state = 1)

クロスバリデーションも簡単（例は決定木とランダムフォレストをグリッドサーチで実装）。

import pandas as pd
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
 
modfnc1 = [DecisionTreeClassifier(), RandomForestClassifier()]
prm1 = [{"max_depth": stats.randint(1,6)
         ,"criterion":["gini","entropy"]
         ,"random_state":[1]}
        ,{"n_estimators": stats.randint(10,201)
         ,"max_depth": stats.randint(1,6)
         ,"criterion":["gini","entropy"]
         ,"max_features":["auto", "sqrt", "log2"]
         ,"random_state":[1]}]
for modfnc1, prm1 in zip(modfnc1, prm1):
    clf = model_selection.RandomizedSearchCV(modfnc1, prm1)
    clf.fit(X_train, y_train)
    y_pred1 = clf.predict(X_test)
    print(clf.best_estimator_)
    print(confusion_matrix(y_test, y_pred1))
    print(classification_report(y_test, y_pred1))
    print()

SASでの実装コード

ホールドアウトでのモデル作成は次の通り。
任意の訓練データと検証データを用いたい場合、optionに scoreprole = valid を付け、partition statement で指定する。

proc hpforest data = 訓練データ seed = 19 scoreprole = valid
    maxtrees = 100 /* pythonではn_estimators */
    maxdepth = 6   /* pythonではmax_depth */
    ;
 
    partition roleVar=selected(train='0' valid='1');
    performance details;
 
    target 目的変数 / level = 変数のタイプ;
    input 特徴量1, ... , 特徴量p;
 
    ods output FitStatistics      = _Res_FitStatistics;      /* 予測値の出力 */
    ods output VariableImportance = _Res_VariableImportance; /* 変数重要度（特徴量重要度）の出力 */
    save file = "hpforest_fit.bin"; /* 将来データ用に予測アルゴリズム出力 */
run;

SASに任せる場合は trainfraction=訓練データの割合 とすればプロシジャ内でデータ分割してくれる。

proc hpforest data = 訓練データ seed = 19 trainfraction=0.8
    maxtrees = 100 /* pythonではn_estimators */
    maxdepth = 6   /* pythonではmax_depth */
    ;
    ...
run;

なお、hpforest プロシジャでは save file = "出力ファイル名.bin" を付けることでモデルのアルゴリズムを出力できる。
この出力ファイルを hp4score に渡せば、新しいデータで予測ができる。

proc hp4score data = alldata;
    score file="hpforest_fit.bin" out=_Pred_RandomForest;
    id PassengerId selected;
run;

SASでお試し（対象データ、データ加工）

今回もKaggleチュートリアルのタイタニックデータを用いる。

データ加工は以前と同じなのでコードをたたんでいる。
【SAS】hpgenselect プロシジャ_罰則付きロジスティック回帰（LASSO） - こちにぃるの日記

★コードを見る場合ここをクリック★

/*---------------------------------------------------
    欠損値処理、変数作成
---------------------------------------------------*/
*-- 数値型：欠損値を中央値に置き換え;
proc stdize data=train out=train method=median reponly;
    var age;
run;

*-- カテゴリ型：欠損値を最頻値に置き換え;
data train;
    set train;
    Embarked = coalescec(Embarked,"S");
run;

/*---------------------------------------------------
    データ分割
---------------------------------------------------*/
proc surveyselect data=train rate=0.2 out=train outall method=srs seed=19;
run;
data train(drop=selected) test(drop=selected);
    set train;
    if selected=1 then output test; else output train;
run;

proc surveyselect data=train rate=0.2 out=train outall method=srs seed=19;
run;

proc format;
    value rollf 0="train-data" 1="validation-data" 2="test-data";
run;
data alldata;
   set train test(in=in1);
   format selected rollf.; *-- フォーマット適用;
   if in1 then selected=2; *-- テストデータに番号振る;
run;

SASでお試し（モデル作成）

ランダムフォレストの実行。

proc hpforest data=train seed=19 scoreprole=valid
    maxtrees    = 100 /* pythonではn_estimators */
    maxdepth    = 6   /* pythonではmax_depth */
    vars_to_try = 4 
    ;
 
    partition roleVar=selected(train='0' valid='1');
    performance details;
 
    target Survived / level = binary;
    input Pclass Sex Age Fare Embarked SibSp Parch;
 
    ods output FitStatistics      = _Res_FitStatistics;
    ods output VariableImportance = _Res_VariableImportance;
    save file="hpforest_fit.bin"; 
run;

f:id:cochineal19:20210712224857p:plain:w400

変数重要度を図示する。

title "Feature Importance";
proc sgplot data=_Res_VariableImportance;
    hbar Variable /response=Gini  groupdisplay=cluster categoryorder=respdesc;
run;

f:id:cochineal19:20210712224944p:plain:w400

作成モデルで予測する。

proc hp4score data = alldata;
    score file="hpforest_fit.bin" out=_Pred_RandomForest;
    id PassengerId selected;
run;
 
data _Pred_RandomForest;
    set _Pred_RandomForest;
    if P_Survived1 > 0.5 then U_Survived = 1; else U_Survived = 0;
run;

SASでお試し（モデル評価）

評価指標を自作し、スコアを算出する。

%macro _Scores(inds=, tlabel=Survived, pred=U_Survived, group=selected);
    proc sql;
        create table &inds._Scores as
            select
                 %if &group.^="" %then %do; &group. , %end;
                 count(*) as N
                ,sum(case when &tlabel.=1 and &pred.=1 then 1 else 0 end) as TP
                ,sum(case when &tlabel.=1 and &pred.=0 then 1 else 0 end) as FN
                ,sum(case when &tlabel.=0 and &pred.=1 then 1 else 0 end) as FP
                ,sum(case when &tlabel.=0 and &pred.=0 then 1 else 0 end) as TN
                ,(calculated TP + calculated TN) / count(*)      as Accuracy    /*正診率(Accuracy)*/
                ,calculated TP / (calculated TP + calculated FN) as Recall      /*検出率(Recall) 、類語：感度*/
                ,calculated TN / (calculated FP + calculated TN) as Specificity /*特異度(Specificity)*/
                ,calculated TP / (calculated TP + calculated FP) as Precision   /*適合率(Precision)*/
                ,2 * calculated Recall * calculated Precision / (calculated Recall + calculated Precision) as f1_score /*F1スコア*/
            from &inds.
             %if &group.^="" %then %do; group by &group. %end;
        ;
    quit;
    
    title "&inds.";
    proc print data=&inds._Scores; run;
    title "";
%mend _Scores;
%_Scores(inds=_Pred_RandomForest);

f:id:cochineal19:20210712225319p:plain

今回の f1スコアは 0.709 くらい。
この前 hpgenselect プロシジャで作成したモデル（Lasso ロジスティック回帰）よりは良い予測性能。
【SAS】hpgenselect プロシジャ_罰則付きロジスティック回帰（LASSO） - こちにぃるの日記

ハイパーパラメータはあまりいじってないので、ここをいじれば性能は良くなるかもしれない。
なお、forest プロシジャではハイパーパラメータを自動調整してくれる autoturn statement があるようだが、hpforest プロシジャに同様の機能があるかは調べられていない。
SAS Help Center