-
728x90
#4/26
# Cluster the rows of loc_df into 15 groups and store each row's cluster id.
kmeans = KMeans(n_clusters=15, random_state=2, n_init=10).fit(loc_df)
loc_df['label'] = kmeans.labels_  # 0, 1, 2 ... 14

# Scatter the points one cluster at a time so each cluster gets its own colour.
for label in loc_df['label'].unique():
    in_cluster = loc_df['label'] == label  # build the mask once, use it for both axes
    plt.plot(loc_df['longitude'][in_cluster], loc_df['latitude'][in_cluster],
             '.', alpha=0.3, markersize=0.3)

# Assign every pickup to its nearest cluster centre.
# The model was fitted on loc_df, whose column names differ from the
# 'pickup_*' names used here; scikit-learn >= 1.2 rejects a DataFrame whose
# feature names do not match the fit-time names, so pass a plain array
# (same column order as before: longitude, latitude).
df['pickup_cluster'] = kmeans.predict(df[['pickup_longitude', 'pickup_latitude']].to_numpy())

# Extract the hour of day, e.g. 2016-03-14 17:24:55 -> 17.
# Vectorized replacement for parsing every row individually with dateutil.
df['pickup_hour'] = pd.to_datetime(df.pickup_datetime).dt.hour

# Red diagonal reference line; linewidth sets the line thickness.
# NOTE(review): `ax` must already exist when this runs - it is not created
# above this line in the visible excerpt; confirm against the full notebook.
ax.plot([0, 250000], [0, 250000], color='r', linewidth=1)
# Animation: a single 10x10-inch figure/axes pair that every frame redraws.
from matplotlib import animation
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
def animate(hour):
    """Redraw the shared axes ``ax`` with the cluster-to-cluster rides of one hour.

    Passed to ``animation.FuncAnimation`` as the per-frame callback.

    Parameters
    ----------
    hour : number
        Compared against ``df.pickup_hour`` to select the rides drawn in this
        frame; also shown in the title as ``int(hour)``.
    """
    ax.clear()
    ax.set_title('Absolute Traffic - Hour ' + str(int(hour)) + ':00')
    # The original also called plt.figure(figsize=(10, 10)) here, which opened
    # a brand-new, never-used figure on every frame; all drawing goes to `ax`.

    # Base picture: every location as a gray dot plus each cluster centre in red.
    for label in loc_df.label.unique():
        in_cluster = loc_df.label == label
        ax.plot(loc_df.longitude[in_cluster], loc_df.latitude[in_cluster],
                '.', alpha=1, markersize=2, color='gray')
        ax.plot(kmeans.cluster_centers_[label, 0], kmeans.cluster_centers_[label, 1],
                'o', color='r')

    # Loop-invariant work: select this hour's rides once instead of per pair.
    hour_rides = df[df.pickup_hour == hour]
    total_rides = len(df)

    for label in clusters['label']:  # presumably 0..14, one per cluster
        origin_x = clusters.x[clusters.label == label].values[0]
        origin_y = clusters.y[clusters.label == label].values[0]
        rides_from_origin = hour_rides[hour_rides.pickup_cluster == label]

        for dest_label in clusters['label']:
            num_of_rides = len(rides_from_origin[rides_from_origin.dropoff_cluster == dest_label])

            # Offset from cluster `label` to cluster `dest_label`
            # (origin minus destination; negated below to point at the destination).
            dist_x = origin_x - clusters.x[clusters.label == dest_label].values[0]
            dist_y = origin_y - clusters.y[clusters.label == dest_label].values[0]

            # Share of ALL rides (not just this hour's) that took this route.
            pct = np.true_divide(num_of_rides, total_rides)

            # Arrow expects scalar x/y; the original passed 1-element arrays.
            arr = Arrow(origin_x, origin_y, -dist_x, -dist_y,
                        edgecolor='white', width=15 * pct)
            ax.add_patch(arr)  # draw the arrow onto the cleared axes
            arr.set_facecolor('g')
# One frame per distinct pickup hour, in ascending order, 1000 ms between frames.
ani = animation.FuncAnimation(fig, animate, sorted(df['pickup_hour'].unique()), interval=1000)
plt.close()  # presumably to keep the static figure from also being displayed
ani.save('animation.gif', writer='imagemagick', fps=2)

# Read the GIF back read-only and close the handle deterministically
# (the original opened it 'r+b' and never closed it).
with open('animation.gif', 'rb') as gif_file:
    video = gif_file.read()

# Embed the GIF inline as a base64 data URI.
encoded = base64.b64encode(video)
HTML(data='''<img src="data:image/gif;base64,{0}" type="gif" />'''.format(encoded.decode('ascii')))
# Excerpt repeated verbatim from the body of animate(): the x / y offset
# between the clusters.x / clusters.y entries of cluster `label` and those of
# cluster `dest_label` (origin minus destination).
# NOTE(review): in the visible code `dest_label` is only assigned inside
# animate()'s loops, so these two lines look like a highlighted snippet rather
# than runnable module-level code - confirm before executing them here.
dist_x = clusters.x[clusters.label == label].values[0] - clusters.x[clusters.label == dest_label].values[0]
dist_y = clusters.y[clusters.label == label].values[0] - clusters.y[clusters.label == dest_label].values[0]
# Longitude -> neighbourhood name; the longitudes are the only feature the
# nearest-neighbour lookup below is trained on.
# NOTE(review): 'Uppe East Side' and 'Brooklyn-Williamsburgt' look like typos.
# They are left byte-identical because these strings become column names and
# prediction labels that other code may match on.
neighborhood = {
    -74.0019368351: 'Chelsea',
    -73.837549761: 'Queens',
    -73.7854240738: 'JFK',
    -73.9810421975: 'Midtown-North-West',
    -73.9862336241: 'East Village',
    -73.971273324: 'Midtown-North-East',
    -73.9866739677: 'Brooklyn-parkslope',
    -73.8690098118: 'LaGuardia',
    -73.9890572967: 'Midtown',
    -74.0081765545: 'Downtown',
    -73.9213024854: 'Queens-Astoria',
    -73.9470256923: 'Harlem',
    -73.9555565018: 'Uppe East Side',
    -73.9453487097: 'Brooklyn-Williamsburgt',
    -73.9745967889: 'Upper West Side',
}
neighborhood_names = list(neighborhood.values())  # 'Chelsea', 'Queens', 'JFK', ...

# Square ride matrix: one row per pickup neighbourhood (kept in 'name'),
# one column per drop-off neighbourhood.
rides_df = pd.DataFrame(columns=neighborhood_names)
rides_df['name'] = neighborhood_names

# 1-nearest-neighbour on longitude alone: each ride end is labelled with the
# neighbourhood whose reference longitude is closest.
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(np.array(list(neighborhood.keys())).reshape(-1, 1), neighborhood_names)
df['pickup_neighborhood'] = neigh.predict(np.array(df.pickup_longitude).reshape(-1, 1))
df['dropoff_neighborhood'] = neigh.predict(np.array(df.dropoff_longitude).reshape(-1, 1))

# Count rides for every (pickup, drop-off) pair in one pass instead of
# filtering the whole ride table once per cell; pairs with no rides become 0.
ride_counts = pd.crosstab(df['pickup_neighborhood'], df['dropoff_neighborhood']).reindex(
    index=neighborhood_names, columns=neighborhood_names, fill_value=0
)
for col in neighborhood_names:
    # Rows of ride_counts are in neighborhood_names order, matching rides_df.name.
    rides_df[col] = ride_counts[col].to_numpy()

# (Blog page residue that was fused onto the last code line: ad slot "728x90"
# and the footer heading "Other posts in the '<Kaggle>' category".)
2022-07-06 credit-fraud-dealing-with-imbalanced-datasets (0) 2022.07.06 캐글 커널 커리큘럼 (0) 2022.05.31 3/29 (0) 2022.03.29 2022-03-04 (0) 2022.03.15 2022-03-03 (0) 2022.03.15