카테고리 없음
4/8
9566
2022. 4. 8. 20:51
728x90
def
모델
데이터 샘플링
변수 중요도(shap)
def _mean_abs_shap(shap_values):
    """Return the mean absolute SHAP value per feature as a 1-D array.

    Accepts the layouts ``TreeExplainer.shap_values`` can produce:

    * a list with one ``(n_samples, n_features)`` array per class,
    * a 3-D array, assumed to be ``(n_samples, n_features, n_classes)`` as
      returned by recent shap releases -- TODO confirm against the installed
      shap version,
    * a 2-D ``(n_samples, n_features)`` array (single-output model).
    """
    if isinstance(shap_values, list):
        # Stacked shape is (n_classes, n_samples, n_features).
        return np.abs(np.stack(shap_values)).mean(axis=(0, 1))
    magnitudes = np.abs(np.asarray(shap_values))
    if magnitudes.ndim == 3:
        return magnitudes.mean(axis=(0, 2))
    return magnitudes.mean(axis=0)


def extract_good_features_using_shap_LGB(params: dict, SEED: int) -> "pd.DataFrame":
    """Rank features by cross-validated SHAP importance of a LightGBM model.

    A multiclass ``LGBMClassifier`` is fitted on each of 5 stratified folds.
    For every fold the mean absolute SHAP value per feature (computed on the
    fold's training rows) and the model's ``feature_importances_`` are
    recorded; the per-fold numbers are then averaged per feature.

    The function reads several names from the enclosing module rather than
    from its arguments: ``train`` (feature DataFrame), ``y`` (target, indexed
    with ``.iloc``), ``categorical_feats``, ``evaluate_macroF1_lgb`` and
    ``print_execution_time``.

    Args:
        params: Hyper-parameters. Must contain the keys ``max_depth``,
            ``learning_rate``, ``colsample_bytree``, ``min_split_gain``,
            ``bagging_freq``, ``min_child_weight``, ``num_leaves``,
            ``subsample``, ``reg_alpha`` and ``reg_lambda``.
        SEED: Seed used for the model, its bagging, and the fold shuffling,
            so repeated calls give the same ranking.

    Returns:
        DataFrame with columns ``feature``, ``shap_values`` and ``feat_imp``,
        one row per feature, sorted by ``shap_values`` in descending order.
        Callers can select "good" features from it, e.g. by keeping rows
        until the cumulative share of ``shap_values`` reaches a threshold.

    Raises:
        KeyError: If ``params`` lacks one of the required keys.
    """
    clf = lgb.LGBMClassifier(
        objective='multiclass',
        # Single source of randomness: the original passed both
        # random_state=1989 and its alias seed=SEED, which conflict.
        random_state=SEED,
        silent=True,
        metric='multi_logloss',
        n_jobs=-1,
        n_estimators=10000,
        class_weight='balanced',
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        colsample_bytree=params['colsample_bytree'],
        min_split_gain=params['min_split_gain'],
        bagging_freq=params['bagging_freq'],
        min_child_weight=params['min_child_weight'],
        num_leaves=params['num_leaves'],
        subsample=params['subsample'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        num_class=len(np.unique(y)),
        bagging_seed=SEED,
    )

    kfold = 5
    # Seed the shuffle; without random_state the folds differ on every call.
    kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=SEED)

    fold_frames = []
    for i, (train_index, test_index) in enumerate(kf.split(train, y)):
        print('='*30, '{} of {} folds'.format(i+1, kfold), '='*30)
        start = time.time()

        X_train, X_val = train.iloc[train_index], train.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # NOTE(review): early_stopping_rounds / verbose (and silent above) are
        # the pre-4.0 LightGBM API; newer releases expect callbacks instead.
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric=evaluate_macroF1_lgb,
            categorical_feature=categorical_feats,
            early_stopping_rounds=500,
            verbose=500,
        )

        shap_values = shap.TreeExplainer(clf.booster_).shap_values(X_train)

        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = X_train.columns
        # Take magnitudes BEFORE averaging: signed SHAP values cancel out
        # across samples, so abs(mean(...)) understates every feature.
        fold_importance_df['shap_values'] = _mean_abs_shap(shap_values)
        fold_importance_df['feat_imp'] = clf.feature_importances_
        fold_frames.append(fold_importance_df)

        print_execution_time(start)

    # Concatenate once instead of growing a DataFrame inside the loop.
    feat_importance_df = pd.concat(fold_frames)
    feat_importance_df_shap = (
        feat_importance_df
        .groupby('feature')
        .mean()
        .sort_values('shap_values', ascending=False)
        .reset_index()
    )
    return feat_importance_df_shap
728x90