# k_means.py — K-means clustering example (TensorFlow 1.x)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
# Generate 2000 sample points drawn from two Gaussian blobs:
# roughly half around (0.0, 0.0) with std 0.9, the rest around (3.0, 1.0)
# with std 0.5 — two well-separated clusters for k-means to recover.
num_puntos = 2000
conjunto_puntos = []
for i in range(num_puntos):
    if np.random.random() > 0.5:
        conjunto_puntos.append([np.random.normal(0.0, 0.9),
                                np.random.normal(0.0, 0.9)])
    else:
        conjunto_puntos.append([np.random.normal(3.0, 0.5),
                                np.random.normal(1.0, 0.5)])

# TF1-style session; the graph below is run through this handle.
sess = tf.Session()

# Number of clusters to fit (the original comment: "a 4-way classification problem").
k = 4
''' Create a Tensor holding all sample points '''
vectors = tf.constant(conjunto_puntos)  # shape (2000, 2)

''' Create k initial means of shape (k, 2) '''
# Shuffle the points and slice off the first k rows as the starting centroids.
# tf.slice: extracts a slice from a tensor; [k, -1] keeps all columns.
centroides = tf.Variable(tf.slice(tf.random_shuffle(vectors), [0, 0], [k, -1]))

''' Increase the dimensions so point-centroid distances broadcast '''
# vectors: (2000, 2) -> expanded_vectors: (1, 2000, 2)
expanded_vectors = tf.expand_dims(vectors, 0)
# centroides: (k, 2) -> expanded_centroides: (k, 1, 2)
expanded_centroides = tf.expand_dims(centroides, 1)

''' Assignment step: label each point with its nearest centroid '''
# tf.square(tf.subtract(...)) broadcasts to shape (k, 2000, 2);
# tf.reduce_sum(..., 2) collapses the coordinate axis -> squared
# Euclidean distances of shape (k, 2000); tf.argmin(..., 0) picks, for
# each point, the index of the nearest of the k centroids -> shape (2000,).
assignments = tf.argmin(tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroides)), 2), 0)
''' Update step: recompute the k cluster means '''
# For each cluster c:
#   tf.where(tf.equal(assignments, c)) -> indices of points assigned to c
#   tf.reshape(..., [1, -1])           -> shape (1, n_c)
#   tf.gather(vectors, ...)            -> those points, shape (1, n_c, 2)
#   tf.reduce_mean(..., axis=[1])      -> cluster mean, shape (1, 2)
# Concatenating the k per-cluster means along axis 0 gives a (k, 2) tensor.
# NOTE: `axis` replaces the deprecated `reduction_indices` keyword.
means = tf.concat(
    [tf.reduce_mean(
        tf.gather(vectors,
                  tf.reshape(tf.where(tf.equal(assignments, c)), [1, -1])),
        axis=[1])
     for c in range(k)],
    axis=0)

''' Move the centroids to the freshly computed means '''
# tf.assign updates the variable in place and returns the assigned value.
update_centroides = tf.assign(centroides, means)
''' Initialize the graph variables '''
# tf.initialize_all_variables() was deprecated in TF 0.12 and later removed;
# tf.global_variables_initializer() is the supported replacement.
init_op = tf.global_variables_initializer()
sess.run(init_op)
''' Run 100 assignment/update iterations '''
for step in range(100):
    _, centroid_values, assignment_values = sess.run(
        [update_centroides, centroides, assignments])

''' Build a tidy DataFrame of the points plus their final cluster label '''
# Construct the columns directly instead of appending element by element
# inside an index loop.
df = pd.DataFrame({
    "x": [p[0] for p in conjunto_puntos],
    "y": [p[1] for p in conjunto_puntos],
    "cluster": assignment_values,
})

# Final centroid coordinates, for overlay on the scatter plot.
m = sess.run(means)

''' Plot with seaborn: points colored by cluster, centroids as black diamonds '''
sns.lmplot("x", "y", data=df, fit_reg=False, size=6, hue="cluster", legend=False)
plt.plot(m[:, 0], m[:, 1], 'kd')
plt.show()  # was missing: a plain script never displays the figure without it

sess.close()  # release the session's resources