# Elbow-method script for choosing the number of k-means clusters.
#
# Loads the Diabetes data set, fits k-means for k = 1..9 on seven profile
# variables, records the mean Euclidean distance from each observation to its
# nearest cluster centre (the "distortion"), and plots distortion against k.

# Libraries for data loading, k-means clustering, spatial metrics, and plotting.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn import metrics  # noqa: F401 -- unused here; kept in case later code relies on it
from sklearn.cluster import KMeans

# Import the data set Diabetes. Change the path to the location of your data
# file.
# File location in my Mac:
#     r'/Users/cyu/Documents/Diabetes.xlsx'
# File location in my Windows:
df = pd.read_excel(r'G:\JUMP\553\Lecture_PowerPoint\Unit_08_cluster_analysis\Python_files\Diabetes.xlsx')

# Put the variables into an array object: one row per observation, one column
# per variable.
Profile = df[['Age', 'BMI', 'BP', 'HDL', 'TCH', 'LTG', 'Glucose']].to_numpy()

# Initialize the list of distortion values as empty.
distortions = []

# Determine k using a loop over the candidate cluster counts.
K = range(1, 10)
for k in K:
    # Fit once per k. n_init is set explicitly because its default differs
    # between scikit-learn releases; random_state makes the curve reproducible
    # from run to run.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0).fit(Profile)

    # Distance from every observation to every centre, reduced to the nearest
    # centre per observation and then averaged over all observations.
    nearest = np.min(
        cdist(Profile, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    distortions.append(nearest.mean())

# Plot the elbow to show the optimal k.
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow graph showing the optimal k')
plt.show()