Model
Data sampling
Variable importance (SHAP)

```python
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
import shap
from sklearn.model_selection import StratifiedKFold


def extract_good_features_using_shap_LGB(params, SEED):
    # Expects the globals `train` (feature frame), `y` (target series),
    # `categorical_feats`, `evaluate_macroF1_lgb`, and `print_execution_time`
    # to be defined elsewhere.
    clf = lgb.LGBMClassifier(objective='multiclass', random_state=1989,
                             silent=True, metric='multi_logloss', n_jobs=-1,
                             n_estimators=10000, class_weight='balanced',
                             max_depth=params['max_depth'],
                             learning_rate=params['learning_rate'],
                             colsample_bytree=params['colsample_bytree'],
                             min_split_gain=params['min_split_gain'],
                             bagging_freq=params['bagging_freq'],
                             min_child_weight=params['min_child_weight'],
                             num_leaves=params['num_leaves'],
                             subsample=params['subsample'],
                             reg_alpha=params['reg_alpha'],
                             reg_lambda=params['reg_lambda'],
                             num_class=len(np.unique(y)),
                             bagging_seed=SEED, seed=SEED)

    kfold = 5
    kf = StratifiedKFold(n_splits=kfold, shuffle=True)
    feat_importance_df = pd.DataFrame()

    for i, (train_index, test_index) in enumerate(kf.split(train, y)):
        print('=' * 30, '{} of {} folds'.format(i + 1, kfold), '=' * 30)
        start = time.time()
        X_train, X_val = train.iloc[train_index], train.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric=evaluate_macroF1_lgb,
                categorical_feature=categorical_feats,
                early_stopping_rounds=500, verbose=500)

        # Global importance per feature: mean |SHAP value|, averaged over
        # classes and samples. Taking the absolute value *before* averaging
        # prevents positive and negative contributions from cancelling out.
        shap_values = shap.TreeExplainer(clf.booster_).shap_values(X_train)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = X_train.columns
        fold_importance_df['shap_values'] = np.abs(np.array(shap_values)).mean(axis=(0, 1))
        fold_importance_df['feat_imp'] = clf.feature_importances_
        feat_importance_df = pd.concat([feat_importance_df, fold_importance_df])
        print_execution_time(start)

    # Average the per-fold importances and rank features by SHAP value.
    feat_importance_df_shap = (feat_importance_df.groupby('feature').mean()
                               .sort_values('shap_values', ascending=False)
                               .reset_index())
    # feat_importance_df_shap['shap_cumsum'] = feat_importance_df_shap['shap_values'].cumsum() / feat_importance_df_shap['shap_values'].sum()
    # good_features = feat_importance_df_shap.loc[feat_importance_df_shap['shap_cumsum'] < 0.999].feature
    return feat_importance_df_shap
```
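The function leans on names that are not defined in this snippet: `evaluate_macroF1_lgb`, `print_execution_time`, and a `params` dict. Below is a minimal sketch of what those pieces might look like and how the function could be called. The helper bodies and all hyperparameter values are assumptions for illustration, not definitions from this post; the final two lines just activate the cumulative-SHAP selection that the function leaves commented out.

```python
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    # Sketch of a macro-F1 eval metric for LightGBM's sklearn API (assumed
    # helper): multiclass predictions arrive as a flat (num_class * n_samples,)
    # array, so reshape to (num_class, n_samples) and argmax per sample.
    pred_labels = predictions.reshape(len(np.unique(truth)), -1).argmax(axis=0)
    return 'macroF1', f1_score(truth, pred_labels, average='macro'), True

def print_execution_time(start):
    # Assumed helper: report wall-clock time for the fold.
    print('elapsed: {:.1f}s'.format(time.time() - start))

# Placeholder hyperparameters -- tune these for your own data.
params = {
    'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.8,
    'min_split_gain': 0.01, 'bagging_freq': 5, 'min_child_weight': 10,
    'num_leaves': 63, 'subsample': 0.9, 'reg_alpha': 0.1, 'reg_lambda': 0.1,
}

feat_imp = extract_good_features_using_shap_LGB(params, SEED=1989)

# Keep the features that cover 99.9% of total SHAP importance, mirroring the
# commented-out selection step inside the function.
feat_imp['shap_cumsum'] = feat_imp['shap_values'].cumsum() / feat_imp['shap_values'].sum()
good_features = feat_imp.loc[feat_imp['shap_cumsum'] < 0.999, 'feature']
```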