-
728x90
# 4/13
corr = train['TARGET'].corr(train['EXT_SOURCE_3'])  # symmetric: same as train['EXT_SOURCE_3'].corr(train['TARGET']) = -0.17891869762837073
# Aggregate every bureau column (except the per-loan id SK_ID_BUREAU) per
# SK_ID_CURR with count / mean / max / min / sum, then attach the result to train.
#
# The group key is kept as the index during aggregation and only turned back
# into a column afterwards, so the aggregated frame's columns are exactly the
# (variable, statistic) pairs that need renaming.
# NOTE(review): combining as_index=False with a list of aggregations appears to
# behave differently across pandas releases; keeping the key as the index
# sidesteps that - confirm against the pandas version in use.
bureau_agg = (
    bureau.drop(columns=['SK_ID_BUREAU'])
    .groupby('SK_ID_CURR')
    .agg(['count', 'mean', 'max', 'min', 'sum'])
)

# Build the flat names from the actual column tuples (not from
# columns.levels), so every name is guaranteed to label the column it sits on.
columns = ['SK_ID_CURR'] + [
    'bureau_%s_%s' % (var, stat) for var, stat in bureau_agg.columns
]

# reset_index() puts SK_ID_CURR first, matching the first entry of `columns`.
bureau_agg = bureau_agg.reset_index()
bureau_agg.columns = columns

# Left join keeps every row of train, including ids with no bureau rows.
train = train.merge(bureau_agg, on='SK_ID_CURR', how='left')
# Re-order new_corrs by the absolute value of each entry's second element,
# largest first (the sign of that element is ignored for ordering).
+ new_corrs = sorted(new_corrs, key = lambda x: abs(x[1]), reverse = True)
더미화 + .agg() + align()
숫자형변수 + .agg() + levels + merge
corr() + sns.kdeplot/heatmap() + median() + drop
feature_importances_ + scatter/bar + dropzero + selectfrommodel
y를 제외한 변수간 상관관계(>0.8)는 삭제 : corr() + 빈 dict/list + index + items + 두 for문
# Remove one column out of every pair of train columns whose pairwise
# correlation is above 0.8, keeping whichever column of the pair is visited first.
corrs = train.corr()

# For each column: the columns (itself included) whose correlation with it
# exceeds the threshold.
# NOTE(review): the comparison is `> 0.8`, so only strong positive correlations
# are matched; strongly negative pairs are left in place - confirm intended.
above_threshold_vars = {
    col: list(corrs.index[corrs[col] > 0.8]) for col in corrs
}

cols_to_remove = []
cols_seen = []
for key, value in above_threshold_vars.items():
    # Record the column before scanning its partners: this both protects it
    # from removal and makes the membership test below skip its
    # self-correlation entry.
    cols_seen.append(key)
    for x in value:
        if x not in cols_seen:
            cols_to_remove.append(x)

# A column can be flagged by several partners; keep each name once.
cols_to_remove = list(set(cols_to_remove))
train_corrs_removed = train.drop(columns=cols_to_remove)
# Convert the -1 placeholders to NaN first, so the nullity matrix below can
# actually display them as missing. `np.nan` is used because the `np.NaN`
# alias is no longer available in NumPy 2.0.
train_copy = train_copy.replace(-1, np.nan)

# Nullity matrix for the columns at positions 2..38 of train_copy.
msno.matrix(
    df=train_copy.iloc[:, 2:39],
    figsize=(20, 14),
    color=(0.42, 0.1, 0.05),
)
# Bar chart of how many rows of train carry each value of the "target" column.
target_counts = train["target"].value_counts()

target_bar = go.Bar(
    x=target_counts.index.values,  # distinct target values, e.g. array([0, 1], dtype=int64)
    y=target_counts.values,  # rows per value, e.g. array([573518, 21694], dtype=int64)
    text='Distribution of target variable',
)
data = [target_bar]

layout = go.Layout(title='Target variable distribution')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic-bar')
# Stacked bar chart comparing, for every column whose name contains '_bin',
# how many rows of train hold 0 and how many hold 1.
bin_col = [col for col in train.columns if '_bin' in col]

# Per-column counts, in the same order as bin_col so the bars line up.
zero_list = [(train[col] == 0).sum() for col in bin_col]
one_list = [(train[col] == 1).sum() for col in bin_col]

trace1 = go.Bar(x=bin_col, y=zero_list, name='Zero count')
trace2 = go.Bar(x=bin_col, y=one_list, name='One count')
data = [trace1, trace2]

# barmode='stack' places the two traces on top of each other per column.
layout = go.Layout(barmode='stack', title='Count of 1 and 0 in binary variables')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')