# k_means_iris.py
# K-means clustering for the iris dataset
import numpy as np import matplotlib.pyplot as plt import pandas as pd import seaborn as sns # from iris_ready import * # load dataset from iris_ready.py
def distance(A, B):
    """Return the Euclidean distance from every row of ``A`` to ``B``.

    ``A - B`` relies on NumPy broadcasting; the squared differences are
    summed along axis 1, so the result holds one distance per row.
    """
    squared_diff = np.square(A - B)
    return np.sqrt(np.sum(squared_diff, axis=1))
# Load the iris data set that ships with scikit-learn.
from sklearn.datasets import load_iris

dataset = load_iris()  # dict-like container
vectors_org = np.array(dataset['data'])  # measurements, one row per sample
target = np.array(dataset['target'])  # class label of each sample
# Select the variables to use and the run options (meanings as labelled by
# the prompts below):
#   imode: 0 = all, 1 = petal, 2 = sepal, 3 = lengths, 4 = widths
#   nmode: 0 = normalise, 1 = do not normalise
#   gmode: 1 = show results only, 0 = also show the plots
# NOTE(review): the prompt strings look like mis-decoded Korean text; they are
# runtime output and are therefore left exactly as they are.
# The built-in int() is used instead of np.int: np.int was only an alias of
# the built-in, was deprecated in NumPy 1.20 and removed in NumPy 1.24, where
# calling it raises AttributeError.
imode = int(input('imode( 0: ¸ðµÎ, 1: ²ÉÀÙ, 2: ²É¹Þħ, 3: ±æÀÌ, 4: µÎ²² ) = '))
nmode = int(input('nmode( Á¤±ÔÈ 0: Çϱâ, 1: ¾ÈÇϱâ ) = '))
gmode = int(input('gmode( 1: °á°ú¸¸ º¸±â, 0: ±×¸²µµ º¸±â ) = '))
# Keep only the feature columns requested through ``imode``; any value
# outside 0-4 leaves ``vectors_org`` untouched.
# NOTE(review): the original labels call imode 1 "petal" and imode 2 "sepal",
# but load_iris presumably orders the columns sepal length, sepal width,
# petal length, petal width -- the two labels look swapped; confirm.
column_choice = {
    0: slice(None),     # all four variables (author's note: normal accuracy)
    1: slice(None, 2),  # first two columns (author's note: normal)
    2: slice(2, None),  # last two columns (author's note: excellent)
    3: [0, 2],          # the two length columns only (author's note: normal)
    4: [1, 3],          # the two width columns only (author's note: normal)
}
if imode in column_choice:
    vectors_org = vectors_org[:, column_choice[imode]]
# Shuffle the samples and their labels with one shared random permutation so
# that rows and labels stay aligned.
num_puntos = len(vectors_org)
ind = np.random.permutation(range(num_puntos))
vectors_org = vectors_org[ind]
target = target[ind]
# Cluster on a copy so that ``vectors_org`` keeps the raw measurements.
vectors = vectors_org.copy()
if nmode == 0:
    # Standardise every variable (column): subtract its mean and divide by
    # its standard deviation; the 1e-7 keeps the divisor non-zero.
    col_mean = vectors.mean(axis=0)
    col_std = vectors.std(axis=0)
    vectors = (vectors - col_mean) / (col_std + 1e-7)
# Number of clusters (k-class problem). Kept fixed at 3: the label-matching
# and report code further down hard-code three classes.
k = 3

# Pick k distinct samples as the initial centroids, shape (k, n_features).
# replace=False matters: np.random.choice samples with replacement by
# default, so the same sample could be drawn twice, giving duplicate
# centroids and therefore an empty cluster whose mean is NaN.
centroids = vectors[np.random.choice(num_puntos, k, replace=False)]
# K-means iteration: alternate between assigning every sample to its nearest
# centroid and moving each centroid to the mean of its members, until the
# centroids stop moving or 500 passes have been made.
c_old = centroids.copy()
for step in range(500):
    # Distances to each centroid stacked as (k, n); argmin over axis 0 gives
    # the index of the nearest centroid for every sample, shape (n,).
    clss = np.argmin(
        [distance(vectors, centroids[r]) for r in range(k)], axis=0
    )

    # Update the centroids by k-means. A cluster that received no samples
    # keeps its previous centroid: np.mean of an empty selection is NaN, and
    # a NaN centroid would poison every later distance computation and make
    # the convergence test below impossible to satisfy.
    centroids = np.array([
        np.mean(vectors[clss == c, :], axis=0) if np.any(clss == c) else c_old[c]
        for c in range(k)
    ])

    # Converged once the total centroid movement is negligible.
    if np.sum(distance(centroids, c_old)) < 1.0e-10:
        break
    c_old = centroids.copy()
# Clustering only yields anonymous cluster ids, not class names, so each
# cluster is renamed to the true class it overlaps with most strongly.
# s[j]: original sample indices assigned to cluster j
# t[j]: original sample indices whose true class is j
s = {j: set(ind[clss == j]) for j in range(k)}
t = {j: set(ind[target == j]) for j in range(k)}

# Rename from a frozen copy so earlier renames cannot affect later ones.
clss2 = clss.copy()
class_size = len(vectors)/3  # the overlap is measured against a third of the samples
for i in range(k):
    for j in range(k):
        p = len(s[i].intersection(t[j]))/class_size
        # The first class whose overlap ratio exceeds 3/5 wins; a cluster
        # with no such class keeps its original id.
        if p > 3/5:
            clss[clss2 == i] = j
            break
# Number of samples whose (renamed) cluster id equals the true class.
ans = (target == clss).sum()

# Result header: a rule, the title for the chosen variable set, a rule.
rule = 35*'--'
titles = {
    # 4 variables: petal length and width, sepal length and width
    0: 'º×²É ºÐ·ù °á°ú( 4-º¯¼ö: ²ÉÀÙ ±æÀÌ¿Í µÎ²², ²É¹Þħ ±æÀÌ¿Í µÎ²² )',
    # 2 variables: petal length and width
    1: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ ±æÀÌ¿Í µÎ²² )',
    # 2 variables: sepal length and width
    2: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²É¹Þħ ±æÀÌ¿Í µÎ²² )',
    # 2 variables: lengths only
    3: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ°ú ²É¹Þħ ±æÀÌ )',
    # 2 variables: widths only
    4: 'º×²É ºÐ·ù °á°ú( 2-º¯¼ö: ²ÉÀÙ°ú ²É¹Þħ µÎ²² )',
}
print('')
print(rule)
if imode in titles:
    # Any other imode prints no title line at all.
    print(titles[imode])
print(rule)
# Summary: sample count, true class sizes, cluster sizes, iteration counter
# and the fraction of samples whose cluster matches the true class.
exact_counts = [np.sum(target == c) for c in range(3)]
cluster_counts = [np.sum(clss == c) for c in range(3)]
matching_ratio = ans/num_puntos
print('The nbr of clustering points = ', num_puntos)
print('Nbr. of each exact class = ', exact_counts)
print('Nbr. of each cluster = ', cluster_counts)
# NOTE(review): ``step`` is the zero-based index of the last pass executed,
# so the value printed is one less than the number of passes -- confirm intent.
print('The number of k-means iteration = ', step)
print('The probability of matching = ', matching_ratio)
# Plots are drawn only when gmode is 0 (gmode 1 means "results only").
# NOTE(review): no plt.show() appears in the visible code, so the figures
# presumably rely on an interactive backend -- confirm.
if gmode==0:
    # Plot the raw measurements rather than the normalised ones.
    vectors = vectors_org.copy()

    # Column dictionary for the DataFrame: four columns when every variable
    # is in use, otherwise the two selected ones.
    if imode==0:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1],
                "Y_l": vectors[:,2], "Y_w": vectors[:,3] }
    else:
        data = {"X_l": vectors[:,0], "X_w": vectors[:,1] }

    df = pd.DataFrame(data)
    dfname = df.copy()
    dfname['Name'] = dataset.target_names[target]      # true class names
    dfassign = df.copy()
    dfassign['cluster'] = dataset.target_names[clss]   # names from the clusters

    # RadViz (multivariate data visualization): true labels vs. clusters.
    plt.figure(1)
    plt.clf()
    pd.plotting.radviz(dfname, 'Name')
    plt.suptitle('Using exact target label')
    plt.figure(2)
    plt.clf()
    pd.plotting.radviz(dfassign, 'cluster')
    plt.suptitle('Using k-means clustering')

    # Scatter matrices coloured by the true labels, then by the clusters.
    pd.plotting.scatter_matrix(df, c=target, figsize=(9, 9), marker='o',
                               hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using exact target label')
    pd.plotting.scatter_matrix(df, c=clss, figsize=(9, 9), marker='o',
                               hist_kwds={'bins': 20}, s=60, alpha=.8)
    plt.suptitle('Using k-means clustering')

    # Reduce to two plotting axes: with all four variables, sum the first two
    # and the last two columns; otherwise use the two selected columns as is.
    if imode==0:
        sepal = np.sum(vectors[:,:2], axis=1)
        petal = np.sum(vectors[:,2:], axis=1)
    else:
        sepal = vectors[:,0]
        petal = vectors[:,1]

    # x/y are passed by keyword and the facet size as ``height``: current
    # seaborn no longer accepts positional x/y, and the old ``size`` keyword
    # of lmplot was renamed to ``height``.
    data = {"x": sepal, "y": petal, "target": target }
    df2 = pd.DataFrame(data)
    sns.lmplot(x="x", y="y", data=df2, fit_reg=False, height=6,
               hue="target", legend=True)
    plt.suptitle('Using exact target label')

    data_sum = {"x": sepal, "y": petal, "cluster": clss }
    df3 = pd.DataFrame(data_sum)
    sns.lmplot(x="x", y="y", data=df3, fit_reg=False, height=6,
               hue="cluster", legend=True)
    plt.suptitle('Using k-means clustering')
|
|
|