4/26 :: 9566

<Kaggle> 2022. 4. 29. 14:26

728x90

# 4/26
# Cluster the rows of loc_df into 15 groups.
# The model is fitted on a plain array rather than the DataFrame so that it
# carries no column names: it is later asked to predict on columns named
# 'pickup_longitude'/'pickup_latitude', and recent scikit-learn releases
# complain about (or reject) feature names that differ from those seen in fit.
# NOTE(review): assumes loc_df holds only the two coordinate columns at this
# point, longitude first — confirm against where loc_df is built.
kmeans = KMeans(n_clusters=15, random_state=2, n_init=10).fit(loc_df.to_numpy())
loc_df['label'] = kmeans.labels_  # cluster id per row: 0, 1, 2, ..., 14

# Plot the points of each cluster with a separate plot call.
for label in loc_df['label'].unique():
    in_cluster = loc_df['label'] == label  # computed once, reused for both axes
    plt.plot(loc_df['longitude'][in_cluster], loc_df['latitude'][in_cluster], '.', alpha=0.3, markersize=0.3)

# Assign every pickup point to its nearest cluster centre.
df['pickup_cluster'] = kmeans.predict(df[['pickup_longitude', 'pickup_latitude']].to_numpy())
# Extract the hour from the timestamp string, e.g. 2016-03-14 17:24:55 -> 17
df['pickup_hour'] = df.pickup_datetime.apply(lambda x: parser.parse(x).hour)

# NOTE(review): `ax` is not created anywhere above this line; this looks like
# an excerpt from a different plotting cell — confirm before running in order.
ax.plot([0, 250000], [0, 250000], color='r', linewidth=1)  # linewidth: line thickness

# Animation
from matplotlib import animation
fig, ax = plt.subplots(1, 1, figsize=(10, 10))


def animate(hour):
    """Redraw the shared axes `ax` for one pickup hour.

    Draws every point of loc_df in gray with the k-means centres in red, then
    adds one green arrow per (pickup cluster, drop-off cluster) pair whose
    width is proportional to the share of all rides in `df` that made that
    trip during `hour`.

    Parameters
    ----------
    hour
        Value compared against ``df.pickup_hour``; also shown in the title
        after conversion with ``int``.
    """
    ax.clear()
    ax.set_title('Absolute Traffic - Hour ' + str(int(hour)) + ':00')
    # No plt.figure() call here: the frame is drawn on the existing `ax`, and
    # opening a fresh figure on every frame only leaves empty figures behind.

    # Base picture: all points plus the cluster centres.
    for label in loc_df.label.unique():
        in_cluster = loc_df.label == label
        ax.plot(loc_df.longitude[in_cluster], loc_df.latitude[in_cluster], '.', alpha=1, markersize=2, color='gray')
        ax.plot(kmeans.cluster_centers_[label, 0], kmeans.cluster_centers_[label, 1], 'o', color='r')

    # Filter by hour once instead of once per cluster pair.
    hour_rides = df[df.pickup_hour == hour]
    total_rides = len(df)

    for label in clusters['label']:  # 0~14
        src_x = clusters.x[clusters.label == label].values[0]
        src_y = clusters.y[clusters.label == label].values[0]
        rides_from_here = hour_rides[hour_rides.pickup_cluster == label]

        for dest_label in clusters['label']:
            num_of_rides = len(rides_from_here[rides_from_here.dropoff_cluster == dest_label])

            # Offset between cluster A (pickup) and cluster B (drop-off).
            dist_x = src_x - clusters.x[clusters.label == dest_label].values[0]
            dist_y = src_y - clusters.y[clusters.label == dest_label].values[0]

            pct = np.true_divide(num_of_rides, total_rides)  # num_of_rides / len(df)
            # Scalars (not 1-element arrays) for the arrow origin; the arrow
            # points from the pickup centre towards the drop-off centre.
            arr = Arrow(src_x, src_y, -dist_x, -dist_y, edgecolor='white', width=15 * pct)
            ax.add_patch(arr)  # add the arrow to the axes
            arr.set_facecolor('g')

# Build the animation with one frame per distinct pickup hour, in ascending order.
ani = animation.FuncAnimation(fig, animate, sorted(df['pickup_hour'].unique()), interval=1000)
plt.close()  # close the current figure
ani.save('animation.gif', writer='imagemagick', fps=2)

# Read the GIF back and embed it as a base64 <img>. The context manager closes
# the file handle, and read-only binary mode is enough since nothing is written.
with open('animation.gif', 'rb') as gif_file:
    video = gif_file.read()
encoded = base64.b64encode(video)
HTML(data='''<img src="data:image/gif;base64,{0}" type="gif" />'''.format(encoded.decode('ascii')))

            # NOTE(review): repeated excerpt of two lines from the inner loop of animate().
            # Each value is the clusters.x / clusters.y entry for `label` minus the entry for `dest_label`.
            dist_x = clusters.x[clusters.label == label].values[0] - clusters.x[clusters.label == dest_label].values[0]
            dist_y = clusters.y[clusters.label == label].values[0] - clusters.y[clusters.label == dest_label].values[0]

# Longitude -> neighbourhood name (presumably the longitudes of the cluster
# centres — confirm against the clustering step).
# NOTE(review): 'Uppe East Side' and 'Brooklyn-Williamsburgt' look like typos;
# they are left unchanged because they end up as labels / column names.
neighborhood = {
    -74.0019368351: 'Chelsea',
    -73.837549761: 'Queens',
    -73.7854240738: 'JFK',
    -73.9810421975: 'Midtown-North-West',
    -73.9862336241: 'East Village',
    -73.971273324: 'Midtown-North-East',
    -73.9866739677: 'Brooklyn-parkslope',
    -73.8690098118: 'LaGuardia',
    -73.9890572967: 'Midtown',
    -74.0081765545: 'Downtown',
    -73.9213024854: 'Queens-Astoria',
    -73.9470256923: 'Harlem',
    -73.9555565018: 'Uppe East Side',
    -73.9453487097: 'Brooklyn-Williamsburgt',
    -73.9745967889: 'Upper West Side',
}

# Materialise the dict views once so pandas / scikit-learn receive real lists.
neighborhood_names = list(neighborhood.values())  # 'Chelsea', 'Queens', 'JFK', ...
neighborhood_longitudes = np.array(list(neighborhood.keys())).reshape(-1, 1)

# 1-nearest-neighbour lookup on longitude only: each ride endpoint gets the
# name whose reference longitude is closest. Latitude is not used.
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(neighborhood_longitudes, neighborhood_names)

df['pickup_neighborhood'] = neigh.predict(np.array(df.pickup_longitude).reshape(-1, 1))
df['dropoff_neighborhood'] = neigh.predict(np.array(df.dropoff_longitude).reshape(-1, 1))

# Ride counts for every (pickup, drop-off) pair in a single pass, instead of
# filtering the whole frame once per pair. Rows follow the pickup
# neighbourhood, columns the drop-off neighbourhood, both in dict order;
# pairs that never occur are filled with 0.
ride_counts = pd.crosstab(df['pickup_neighborhood'], df['dropoff_neighborhood'])
ride_counts = ride_counts.reindex(index=neighborhood_names, columns=neighborhood_names, fill_value=0)

# Same layout as before: positional index, one column per drop-off
# neighbourhood, and the pickup neighbourhood in a trailing 'name' column.
rides_df = ride_counts.reset_index(drop=True)
rides_df.columns.name = None
rides_df['name'] = neighborhood_names

728x90

저작자표시

'<Kaggle>' 카테고리의 다른 글

2022-07-06 credit-fraud-dealing-with-imbalanced-datasets (0)	2022.07.06
캐글 커널 커리큘럼 (0)	2022.05.31
3/29 (0)	2022.03.29
2022-03-04 (0)	2022.03.15
2022-03-03 (0)	2022.03.15

ABOUT ME

9566

'<Kaggle>' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'<Kaggle>' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바