메일보내기

이름검색

::: Research Board :::


9 11 통계카운터 보기   회원 가입 회원 로그인 관리자 접속 --+
Name   신병춘
Subject   k-means clustering for the classification of iris


# k_means_iris.py
# K-means clustering for the iris data set

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from iris_ready import *   # load dataset from iris_ready.py

def distance(A, B):
    """Return the Euclidean distance along axis 1 between ``A`` and ``B``.

    ``A`` and ``B`` are subtracted with NumPy broadcasting, so ``B`` may be
    a single row that is compared against every row of ``A``. The squared
    differences are summed over axis 1, which means ``A - B`` must have at
    least two dimensions.
    """
    diff = A - B
    return np.sqrt(np.square(diff).sum(axis=1))

# Load the iris data set that ships with scikit-learn.
from sklearn.datasets import load_iris
dataset = load_iris()    # indexed like a dict below
# Copy the feature matrix and the class labels into independent arrays.
vectors_org = np.array(dataset['data'])
target = np.array(dataset['target'])

# Ask the user which feature columns to cluster on (imode), whether to
# normalise them (nmode) and whether to draw figures (gmode).
#
# NOTE(review): the prompt strings look like Korean text that was mis-decoded
# somewhere upstream (EUC-KR read as Latin-1). They are kept byte-for-byte
# here. They presumably read "0: all, 1: petal, 2: sepal, 3: length,
# 4: width", "normalisation 0: on, 1: off" and "1: results only, 0: also show
# figures" -- confirm against the original file.
#
# The builtin int is used instead of np.int: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
imode = int(input('imode( 0: ¸ðµÎ, 1: ²ÉÀÙ, 2: ²É¹Þħ, 3: ±æÀÌ, 4: µÎ²² )  = '))
nmode = int(input('nmode( Á¤±ÔÈ­ 0: Çϱâ, 1: ¾ÈÇϱâ )  = '))
gmode = int(input('gmode( 1: °á°ú¸¸ º¸±â,  0: ±×¸²µµ º¸±â )  = '))

# NOTE(review): scikit-learn documents the iris columns as sepal length,
# sepal width, petal length, petal width, so the prompt appears to label
# modes 1 and 2 the other way round -- confirm before relying on the wording.
# The trailing remarks are the original author's accuracy notes.
# Any other imode value leaves all four columns in place.
if imode == 0:      # all four columns
    vectors_org = vectors_org[:, :]       # normal accuracy
elif imode == 1:    # columns 0 and 1
    vectors_org = vectors_org[:, :2]      # normal
elif imode == 2:    # columns 2 and 3
    vectors_org = vectors_org[:, 2:]      # excellent
elif imode == 3:    # the two length columns (0 and 2)
    vectors_org = vectors_org[:, [0, 2]]  # normal
elif imode == 4:    # the two width columns (1 and 3)
    vectors_org = vectors_org[:, [1, 3]]  # normal

# Shuffle the samples. ``ind`` is the random permutation applied to the rows,
# so ind[i] is the pre-shuffle row index of shuffled sample i.
ind = np.random.permutation(range(len(vectors_org)))
vectors_org = vectors_org[ind]
target = target[ind]
num_puntos = vectors_org.shape[0]

# Work on a copy so the unscaled measurements stay available in vectors_org.
vectors = vectors_org.copy()
# nmode == 0: standardise every column to zero mean and unit variance.
# The small constant keeps the division finite for a zero-variance column.
if nmode == 0:
    col_mean = vectors.mean(axis=0)
    col_std = vectors.std(axis=0)
    vectors = (vectors - col_mean) / (col_std + 1e-7)

# Number of clusters to look for.
k = 3

# Pick k distinct samples as the initial centroids; the result has shape
# (k, number of selected columns). Sampling without replacement matters:
# with the default replace=True the same sample can be drawn twice, which
# gives two identical centroids and leaves one cluster empty.
centroids = vectors[np.random.choice(num_puntos, k, replace=False)]

# Alternate assignment and update steps until the centroids stop moving,
# for at most 500 passes. ``step`` holds the 0-based index of the last pass.
c_old = centroids.copy()
for step in range(500):
    # Assignment step: distance of every sample to every centroid, shape
    # (k, n), then the index of the nearest centroid per sample, shape (n,).
    dists = np.array([distance(vectors, centroids[r]) for r in range(k)])
    clss = np.argmin(dists, axis=0)

    # Update step: move each centroid to the mean of its members. A cluster
    # that lost all members keeps its previous centroid; the mean of an empty
    # slice would be NaN and would poison every later distance.
    new_centroids = []
    for c in range(k):
        members = vectors[clss == c]
        new_centroids.append(members.mean(axis=0) if len(members) else centroids[c])
    centroids = np.array(new_centroids)

    # Converged once the total centroid displacement is negligible.
    if np.sum(distance(centroids, c_old)) < 1.0e-10:
        break
    c_old = centroids.copy()

# K-means only produces anonymous cluster ids. Relabel each cluster with the
# true class it overlaps most clearly so it can be compared with ``target``.
# s[j]: original row indices assigned to cluster j;
# t[j]: original row indices whose true class is j.
s = {j: set(ind[clss == j]) for j in range(k)}
t = {j: set(ind[target == j]) for j in range(k)}

# Relabel against a frozen copy so earlier relabelling cannot affect the
# masks used for later clusters.
# NOTE(review): a cluster whose overlap never exceeds the threshold keeps its
# arbitrary cluster id -- confirm this is intended.
clss2 = clss.copy()
expected_size = len(vectors) / 3   # one third of all samples
for i in range(k):
    for j in range(k):
        shared = s[i] & t[j]
        p = len(shared) / expected_size
        # Accept class j once the overlap exceeds 60% of that size.
        if p > 3/5:
            clss[clss2 == i] = j
            break

# Number of samples whose relabelled cluster equals the true class.
ans = np.sum(target == clss)
# Print a summary of the clustering run.
# NOTE(review): the headline strings look like mis-decoded Korean text and
# are kept byte-for-byte; each one presumably names the feature subset that
# was selected through imode -- confirm against the original file.
titles = {
    0: 'º×²É ºÐ·ù °á°ú( 4-º¯¼ö: ²ÉÀÙ ±æÀÌ¿Í µÎ²², ²É¹Þħ ±æÀÌ¿Í µÎ²² )',
    1: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ ±æÀÌ¿Í µÎ²² )',
    2: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²É¹Þħ ±æÀÌ¿Í µÎ²² )',
    3: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ°ú ²É¹Þħ ±æÀÌ )',
    4: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ°ú ²É¹Þħ µÎ²² )',
}

print('')
print(35*'--')
# Any other imode value prints no headline, only the two rules.
if imode in titles:
    print(titles[imode])
print(35*'--')

print('The nbr of clustering points = ', num_puntos )
print('Nbr. of each exact class = ', [np.sum(target==c) for c in range(3)])
# Count every cluster that was requested, not a hard-coded three.
print('Nbr. of each cluster = ', [np.sum(clss==c) for c in range(k)])
# ``step`` is the 0-based index of the last pass, so the number of passes
# actually performed is step + 1.
print('The number of k-means iteration = ', step + 1 )
print('The probability of matching = ', ans/num_puntos )

# Draw the figures only when the user asked for them (gmode == 0).
if gmode==0:
    # Plot the unscaled measurements rather than the normalised ones.
    vectors = vectors_org.copy()
    if imode==0:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1], "Y_l": vectors[:,2],
                "Y_w": vectors[:,3] }
    else:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1] }

    # One frame labelled with the true class names, one with the names of
    # the classes the clusters were matched to.
    df = pd.DataFrame(data)
    dfname = df.copy(); dfname['Name'] = dataset.target_names[target]
    dfassign = df.copy(); dfassign['cluster'] = dataset.target_names[clss]

    # RadViz projection of all selected columns, true labels vs. clusters.
    plt.figure(1); plt.clf()
    pd.plotting.radviz(dfname, 'Name')
    plt.suptitle('Using exact target label')
    plt.figure(2); plt.clf()
    pd.plotting.radviz(dfassign, 'cluster')
    plt.suptitle('Using k-means clustering')

    # Pairwise scatter matrices coloured by true label and by cluster.
    pd.plotting.scatter_matrix(df, c=target, figsize=(9, 9), marker='o',
                hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using exact target label')
    pd.plotting.scatter_matrix(df, c=clss, figsize=(9, 9), marker='o',
                hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using k-means clustering')

    # Reduce to two plotting axes: with all four columns, sum the first two
    # and the last two; otherwise use the two selected columns directly.
    if imode==0:
        sepal = np.sum(vectors[:,:2], axis=1); petal = np.sum(vectors[:,2:], axis=1)
    else:
        sepal = vectors[:,0]; petal = vectors[:,1]

    # Current seaborn requires x and y as keywords and uses ``height`` in
    # place of the old ``size`` argument.
    data = {"x": sepal, "y": petal, "target": target }
    df2 = pd.DataFrame(data)
    sns.lmplot(x="x", y="y", data=df2, fit_reg=False, height=6, hue="target", legend=True)
    plt.suptitle('Using exact target label')

    data_sum = {"x": sepal, "y": petal, "cluster": clss }
    df3 = pd.DataFrame(data_sum)
    sns.lmplot(x="x", y="y", data=df3, fit_reg=False, height=6, hue="cluster", legend=True)
    plt.suptitle('Using k-means clustering')

    # Show the figures when run as a plain script; without this call a
    # non-interactive session exits without displaying anything.
    plt.show()

게시물을 이메일로 보내기 프린트출력을 위한 화면보기
DATE: 2019.07.05 - 15:10


 이전글 입력파일 for mnist using tensorflow
 다음글 k-means clustering using tensorflow
글남기기삭제하기수정하기답변달기검색목록 보기