Research Board

Name	신병춘

Subject	k-means clustering for the classification of iris

# k_means_iris.py
# K-means Clustering(군집화) for iris

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from iris_ready import *   # load dataset from iris_ready.py

def distance(A, B):
    """Return the Euclidean distance between matching rows of ``A`` and ``B``.

    The squared differences are summed along axis 1, so the result holds one
    distance per row.  ``B`` may also be a single vector; NumPy broadcasting
    then compares it with every row of ``A``.
    """
    offsets = A - B
    squared_norms = np.sum(offsets**2, axis=1)
    return np.sqrt(squared_norms)

# Load the iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris
dataset = load_iris()    # dictionary-like container, indexed by key below
vectors_org = np.array(dataset['data'])     # measurements, one row per sample
target = np.array(dataset['target'])        # integer class label per sample

# Ask which measurements to cluster on and how to run the script.
#   imode: 0 = all, 1 = petal, 2 = sepal, 3 = lengths, 4 = widths
#   nmode: 0 = standardize the features, 1 = use the raw values
#   gmode: 1 = print the results only, 0 = also draw the figures
# The built-in int() is used because the np.int alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
imode = int(input('imode( 0: 모두, 1: 꽃잎, 2: 꽃받침, 3: 길이, 4: 두께 )  = '))
nmode = int(input('nmode( 정규화 0: 하기, 1: 안하기 )  = '))
gmode = int(input('gmode( 1: 결과만 보기,  0: 그림도 보기 )  = '))

# Keep only the feature columns requested through imode.  The iris columns
# are ordered sepal length, sepal width, petal length, petal width, so the
# petal choice (imode 1) must take the last two columns and the sepal choice
# (imode 2) the first two.  The original slices had these two swapped
# relative to the labels shown in the prompt and in the result header.
# Any other imode value leaves all four columns in place.
feature_columns = {
    0: [0, 1, 2, 3],   # all four measurements
    1: [2, 3],         # petal length and width
    2: [0, 1],         # sepal length and width
    3: [0, 2],         # sepal and petal length
    4: [1, 3],         # sepal and petal width
}
if imode in feature_columns:
    vectors_org = vectors_org[:, feature_columns[imode]]

# Shuffle the samples and their labels with one shared random permutation.
num_puntos = len(vectors_org)
ind = np.random.permutation(range(num_puntos))
vectors_org = vectors_org[ind]
target = target[ind]

# Cluster on a copy so the unscaled values stay available in vectors_org.
vectors = vectors_org.copy()
# Standardize every feature column when nmode == 0.  The small constant
# added to the standard deviation presumably guards against dividing by
# zero for a constant column.
if nmode == 0:
    column_mean = np.mean(vectors, 0)
    column_std = np.std(vectors, 0)
    vectors = (vectors - column_mean) / (column_std + 1e-7)

# Number of clusters to look for.
# k = int( input('How many kind of classification ? '))
k = 3

# Pick k distinct samples as the initial centroids, giving an array of
# shape (k, n_features).  replace=False matters: np.random.choice samples
# with replacement by default, so the same sample could be drawn twice,
# producing duplicate centroids and a cluster that never receives a point.
centroids = vectors[np.random.choice(num_puntos, k, replace=False)]

# Alternate between assigning every point to its nearest centroid and moving
# each centroid to the mean of its points, until the centroids stop moving
# or 500 iterations have been done.
c_old = centroids.copy()
for step in range(500):
    # Distances from every point to every centroid stack to shape (k, n);
    # the row index of the smallest entry is the assigned cluster.
    clss = np.argmin([distance(vectors, centroids[r]) for r in range(k)], axis=0)

    # Move each centroid to the mean of its members.  A cluster that received
    # no points keeps its previous centroid (c_old still holds it here): the
    # mean of an empty selection is NaN, and a NaN centroid would corrupt
    # every later distance and make the convergence test below unsatisfiable.
    centroids = np.array([
        np.mean(vectors[clss == c, :], axis=0) if np.any(clss == c) else c_old[c]
        for c in range(k)
    ])
    # Stop once the total centroid movement is negligible.
    if np.sum(distance(centroids, c_old)) < 1.0e-10:
        break
    c_old = centroids.copy()

# k-means only produces anonymous cluster ids.  Rename each cluster to the
# true class it captures: cluster i takes label j when it holds more than
# 3/5 of the samples whose true class is j (first such j wins).  A cluster
# with no such class keeps its original id.
clss2 = clss.copy()   # frozen assignment, so relabelling cannot cascade
for i in range(k):
    in_cluster = clss2 == i
    for j in range(k):
        class_size = np.sum(target == j)
        if class_size == 0:
            continue   # no sample carries this label; nothing to match
        # Fraction of class j found in cluster i.  Dividing by the real
        # class size instead of len(vectors)/3 drops the assumption of
        # exactly three equally sized classes.
        p = np.sum(in_cluster & (target == j)) / class_size
        if p > 3/5:
            clss[in_cluster] = j
            break

# Number of samples whose relabelled cluster equals the true class.
ans = np.sum(target == clss)

# Report the results.  The header echoes the label of the chosen imode;
# an imode outside 0-4 prints no header line.
result_headers = {
    0: '붓꽃 분류 결과( 4-변수: 꽃잎 길이와 두께, 꽃받침 길이와 두께 )',
    1: '붓꽃 분류 결과( 2-변수: 꽃잎 길이와 두께 )',
    2: '붓꽃 분류 결과( 2-변수: 꽃받침 길이와 두께 )',
    3: '붓꽃 분류 결과( 2-변수: 꽃잎과 꽃받침 길이 )',
    4: '붓꽃 분류 결과( 2-변수: 꽃잎과 꽃받침 두께 )',
}
print('')
print(35*'--')
if imode in result_headers:
    print(result_headers[imode])
print(35*'--')

print('The nbr of clustering points = ', num_puntos )
print('Nbr. of each exact class = ', [np.sum(target==c) for c in range(3)])
print('Nbr. of each cluster = ', [np.sum(clss==c) for c in range(3)])
# step is the zero-based index of the last pass through the k-means loop,
# so the number of iterations actually performed is step + 1.
print('The number of k-means iteration = ', step + 1 )
print('The probability of matching = ', ans/num_puntos )

# gmode = 1
if gmode == 0:
    # Plot the unscaled measurements rather than the standardized ones.
    # NOTE(review): plt.show() is never called here, so the figures appear
    # only where the matplotlib backend displays them without it - confirm
    # this matches how the script is run.
    vectors = vectors_org.copy()
    if imode == 0:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1], "Y_l": vectors[:,2],
                "Y_w": vectors[:,3] }
    else:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1] }

    # One frame of raw features, plus copies tagged with the true class
    # name and with the class name assigned by the clustering.
    df = pd.DataFrame(data)
    dfname = df.copy()
    dfname['Name'] = dataset.target_names[target]
    dfassign = df.copy()
    dfassign['cluster'] = dataset.target_names[clss]

    # RadViz projection of all selected features, true labels vs. clusters.
    plt.figure(1)
    plt.clf()
    pd.plotting.radviz(dfname, 'Name')
    plt.suptitle('Using exact target label')
    plt.figure(2)
    plt.clf()
    pd.plotting.radviz(dfassign, 'cluster')
    plt.suptitle('Using k-means clustering')

    # Pairwise scatter matrices coloured by true label and by cluster.
    pd.plotting.scatter_matrix(df, c=target, figsize=(9, 9), marker='o',
                hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using exact target label')
    pd.plotting.scatter_matrix(df, c=clss, figsize=(9, 9), marker='o',
                hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using k-means clustering')

    # Reduce to two plotting coordinates: with all four features, sum the
    # first two and the last two columns; otherwise use the two selected
    # columns directly.
    if imode == 0:
        sepal = np.sum(vectors[:,:2], axis=1)
        petal = np.sum(vectors[:,2:], axis=1)
    else:
        sepal = vectors[:,0]
        petal = vectors[:,1]

    # Current seaborn requires x/y as keywords and uses `height` in place
    # of the removed `size` argument.
    data = {"x": sepal, "y": petal, "target": target }
    df2 = pd.DataFrame(data)
    sns.lmplot(x="x", y="y", data=df2, fit_reg=False, height=6, hue="target",
               legend=True)
    plt.suptitle('Using exact target label')

    data_sum = {"x": sepal, "y": petal, "cluster": clss }
    df3 = pd.DataFrame(data_sum)
    sns.lmplot(x="x", y="y", data=df3, fit_reg=False, height=6, hue="cluster",
               legend=True)
    plt.suptitle('Using k-means clustering')

DATE: 2019.07.05 - 15:10

입력파일 for mnist using tensorflow

k-means clustering using tensorflow